import logging
import warnings
from collections.abc import Collection, Mapping
from copy import deepcopy
from typing import Any, Callable, Optional, overload, Union

import torch
import torch.nn as nn
from torch import optim
from torch.distributed._shard.sharded_tensor import ShardedTensor
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP


__all__: list[str] = []

logger = logging.getLogger(__name__)


class _NamedOptimizer(optim.Optimizer):
    """
    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by parameter key.

    We replace the original key (a number) in the optimizer with each parameter's
    fully qualified name (FQN). Users can initialize the optimizer the same way
    they initialize a regular PyTorch optimizer; the only difference is that they
    also need to pass in the FQN of each parameter.

    Args:
        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
            Mapping from FQN to parameter.
        optimizer_class (optim.Optimizer):
            The class of optimizer to instantiate.
        param_groups (Collection[Mapping[str, Any]]):
            `param_groups` to pass to the optimizer if specified.
            The key of the inner map needs to be FQNs.
            Default: None
        module (nn.Module): the module whose parameters are updated
            by the optimizer.
        args: arguments to pass to the optimizer constructor.
        kwargs: arguments to pass to the optimizer constructor.

    Example::
        >>> # xdoctest: +SKIP("distributed")
        >>> from torch import optim
        >>> from torch.distributed.optim import _NamedOptimizer
        >>>
        >>> # Define the named optimizer.
        >>> m = Model(...)
        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
        >>> # Forward pass + backward pass.
        >>> named_optim.step()
        >>> ...
        >>> # Calling state_dict on the named optimizer returns a FQN-keyed state_dict.
        >>> named_optim.state_dict()
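        >>> # Illustrative sketch (not part of the original docs): ``param_groups`` may
        >>> # also be passed explicitly; each group's "params" holds parameters of ``m``
        >>> # (the attribute names below are hypothetical for the toy ``Model``).
        >>> groups = [{"params": [m.weight], "lr": 0.1}, {"params": [m.bias], "lr": 0.01}]
        >>> named_optim = _NamedOptimizer(dict(m.named_parameters()), optim.SGD, param_groups=groups)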

    Warning: This API is still in development and subject to change.

    TODO: Add tutorial for _NamedOptimizer.
    TODO: Add documentation in the docstring for the public attributes
          like self.param_groups and self.named_parameters.
    """

    def __init__(
        self,
        named_parameters: Mapping[str, Union[torch.Tensor, ShardedTensor]],
        optimizer_class: optim.Optimizer,
        param_groups: Optional[Collection[Mapping[str, Any]]] = None,
        module: Optional[nn.Module] = None,
        *args,
        **kwargs,
    ) -> None:
        torch._C._log_api_usage_once("torch.distributed.optim._NamedOptimizer")
        self.param_groups = param_groups
        self._param_groups_check()
        self.named_parameters = dict(named_parameters)
        params_for_optimizer = (
            self.named_parameters.values() if param_groups is None else param_groups
        )
        self._optimizer = optimizer_class(params_for_optimizer, *args, **kwargs)
        self.module = module
        if param_groups is None:
            self.ordered_param_keys = list(self.named_parameters.keys())
        else:
            warnings.warn(
                "Since we pass in param_groups, we will use param_groups to "
                "initialize the optimizer, not all parameters of the module."
            )
            param_to_key = {param: key for key, param in named_parameters.items()}
            ordered_param_keys = []
            for group in param_groups:
                for param in group["params"]:
                    if param not in param_to_key:
                        raise ValueError(
                            f"Expect param name {param} found in param group but is missing."
                        )
                    ordered_param_keys.append(param_to_key[param])
            self.ordered_param_keys = ordered_param_keys
        # Keep self.param_groups in sync with the wrapped optimizer.
        self.param_groups = self._optimizer.param_groups

    def _param_groups_check(self):
        if self.param_groups is not None:
            for param_group in self.param_groups:
                assert isinstance(param_group, dict), "param group must be a dict"
                assert "params" in param_group, "param group must contain key params"
                params = param_group["params"]
                if isinstance(params, torch.Tensor):
                    params = [params]
                params = list(params)
                for param in params:
                    if not isinstance(param, torch.Tensor):
                        raise TypeError(
                            "optimizer can only optimize Tensors, "
                            "but one of the params is " + torch.typename(param)
                        )
                param_group["params"] = params

    def state_dict(self) -> dict[str, Any]:
        """
        Return the ``state_dict`` of the optimizer.

        Instead of using numbers to index parameters, we use each parameter's
        module fully qualified name (FQN) as the key.
        """
        state_dict = self._optimizer.state_dict()
        param_groups = state_dict["param_groups"]

        # Re-key the per-parameter state from positional indices to FQNs.
        ret_state = {
            self.ordered_param_keys[st_key]: state_val
            for st_key, state_val in state_dict["state"].items()
        }

        # Rewrite each param group so that "params" lists FQNs instead of indices.
        ret_groups = []
        for group in param_groups:
            param_keys = [self.ordered_param_keys[param] for param in group["params"]]
            ret_group = {"params": sorted(param_keys)}
            for k, v in group.items():
                if k != "params":
                    ret_group[k] = deepcopy(v)
            ret_groups.append(ret_group)

        return self._post_state_dict({"state": ret_state, "param_groups": ret_groups})

    @overload
    def step(self, closure: None = ...) -> None: ...

    @overload
    def step(self, closure: Callable[[], float]) -> float: ...

    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
        """
        Perform a single optimization step.

        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
        optimizer.
        """
        return self._optimizer.step(closure=closure)

    @property
    def state(self) -> Mapping[torch.Tensor, Any]:
        return self._optimizer.state

    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
        """
        Define the default behavior to load a state_dict for ``_NamedOptimizer``.

        Sample Code
        ```
            my_model = MyModule()
            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
            ...

            optim_state_dict = optimizer.state_dict()
            ...
            ...

            optimizer.load_state_dict(optim_state_dict)
            ...
        ```
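
        If the wrapped optimizer has not materialized its state yet (lazy init),
        one illustrative pattern, reusing ``optimizer`` and ``optim_state_dict``
        from the sample above, is to call ``init_state`` before loading:
        ```
            optimizer.init_state()
            optimizer.load_state_dict(optim_state_dict)
        ```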
        Args:
            state_dict (Dict[str, Any]): A ``state_dict`` to load into the optimizer.
                Note that this state dict update is performed in place.

        .. note:: PyTorch uses lazy init to initialize the optim states, so it is
            possible that there is no optim state when a user calls
            ``load_state_dict``. For ``_NamedOptimizer`` we are stricter: users can
            only call ``load_state_dict`` after the state is initialized. By doing
            this, we can validate the optim ``state_dict`` to be loaded.
        """
        new_state_dict = self._optimizer.state_dict()
        state_dict = self._pre_load_state_dict(state_dict)
        state = state_dict["state"]
        new_state = new_state_dict["state"]
        if len(new_state) == 0:
            raise ValueError(
                "Expects the optim to be initialized before load but found not initialized."
            )

        for idx, param_key in enumerate(self.ordered_param_keys):
            # Not all parameters need to appear in the loaded state_dict
            # (e.g. with conditional training), so skip missing keys.
            if param_key not in state.keys():
                continue
            if len(state[param_key]) != len(new_state[idx]):
                raise ValueError(
                    f"Expects equal length as {len(new_state[idx])} for parameter "
                    f"{param_key} but found: {len(state[param_key])}"
                )
            # Iterate through all optimizer states for this parameter.
            for state_key, state_val in new_state[idx].items():
                if state_key not in state[param_key]:
                    raise ValueError(
                        f"Expects state {state_key} for parameter {param_key} but not found."
                    )

                src_state_val = state[param_key][state_key]
                if isinstance(state_val, ShardedTensor):
                    assert isinstance(src_state_val, ShardedTensor)
                    num_shards = len(state_val.local_shards())
                    num_new_shards = len(src_state_val.local_shards())
                    if num_shards != num_new_shards:
                        raise ValueError(
                            f"Expects equal number of shards as {num_new_shards} "
                            f"but found {num_shards} for {param_key}/{state_key}"
                        )
                    for shard, src_shard in zip(
                        state_val.local_shards(), src_state_val.local_shards()
                    ):
                        shard.tensor.detach().copy_(src_shard.tensor)
                elif isinstance(state_val, torch.Tensor):
                    assert isinstance(src_state_val, torch.Tensor)
                    state_val.detach().copy_(src_state_val)
                else:
                    new_state[idx][state_key] = deepcopy(src_state_val)

        # Load the param_groups of the state_dict, matching groups by their FQN key.
        src_param_groups = state_dict["param_groups"]
        new_param_groups = new_state_dict["param_groups"]

        src_group_map = {}
        for group in src_param_groups:
            param_keys = list(group["params"])
            src_group_map[_gen_param_group_key(param_keys)] = group
        new_group_map = {}
        for new_group in new_param_groups:
            param_keys = []
            for param_key in new_group["params"]:
                param_keys.append(self.ordered_param_keys[param_key])
            new_group_map[_gen_param_group_key(param_keys)] = new_group
        for group_key, new_group in new_group_map.items():
            # Not all groups need to appear in the loaded state_dict when not all
            # parameters are used in the optimizer, so skip missing groups.
            if group_key not in src_group_map:
                continue
            src_group = src_group_map[group_key]
            if len(src_group) != len(new_group):
                raise ValueError(
                    f"Expects equal param_group size as {len(new_group)} "
                    f"for group {group_key} but found {len(src_group)}."
                )
            for k in src_group:
                if k not in new_group:
                    raise ValueError(
                        f"Expects group key {k} to be in group {group_key} "
                        f"in `state_dict` but is missing."
                    )
                if k != "params":
                    new_group[k] = deepcopy(src_group[k])

        self._optimizer.load_state_dict(new_state_dict)

    def add_param_group(self, param_group: Mapping[str, Any]) -> None:
        """
        Add a param group to the :class:`_NamedOptimizer`'s `param_groups`.

        Warning: This API is still in development and subject to change.
        """
        assert isinstance(param_group, dict), "param group must be a dict"

        params = param_group["params"]
        if isinstance(params, torch.Tensor):
            param_group["params"] = [params]
        else:
            param_group["params"] = list(params)

        param_to_key = {param: key for key, param in self.named_parameters.items()}
        for param in param_group["params"]:
            if param not in param_to_key:
                raise ValueError("some parameters are not in the module")
            self.ordered_param_keys.append(param_to_key[param])

        self._optimizer.add_param_group(param_group)
        # Keep self.param_groups in sync with the wrapped optimizer.
        self.param_groups = self._optimizer.param_groups

    def init_state(self) -> None:
        """
        Run a dummy optimizer step to initialize optimizer state, since most optimizers use lazy init.

        This allows doing in-place loading of optimizer state from a checkpoint.
        """
        for param in self.named_parameters.values():
            if param.requires_grad:
                t = torch.zeros_like(param)
                param.grad = torch.autograd.Variable(t)
        # Calling ``step`` creates the initial optimizer states.
        self.step(closure=None)

    def _pre_load_state_dict(self, state_dict) -> dict[str, Any]:
        # If the module is FSDP-wrapped, let FSDP convert the optim state_dict for loading.
        if isinstance(self.module, FSDP):
            return FSDP.optim_state_dict_to_load(
                self.module, self._optimizer, state_dict, is_named_optimizer=True
            )
        return state_dict

    def _post_state_dict(self, state_dict) -> dict[str, Any]:
        # If the module is FSDP-wrapped, let FSDP post-process the optim state_dict.
        if isinstance(self.module, FSDP):
            FSDP.optim_state_dict(self.module, self._optimizer, state_dict)
        return state_dict


def _gen_param_group_key(param_keys: list[str]) -> str:
    """Concatenate all param keys as a unique identifier for one param group."""
    return "/".join(sorted(param_keys))