o
    Wh                      @   sl   d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ dd ZG d	d
 d
ejjZdS )    N)build_yolo_dataset)yolo)
WorldModel)DEFAULT_CFGRANKchecks)de_parallelc                 C   s   t dv rdd t| jjjd  D }t| jjj|dd t	| j
 j}| jjd|d\| _}| j D ]}|d q7d	S )
zUCallback to set up model classes and text encoder at the end of the pretrain routine.>   r   c                 S   s   g | ]	}| d d qS )/r   )split).0name r   W/var/www/vscode/kcb/lib/python3.10/site-packages/ultralytics/models/yolo/world/train.py
<listcomp>   s    z+on_pretrain_routine_end.<locals>.<listcomp>namesF)cache_clip_modelzViT-B/32)deviceN)r   listtest_loaderdatasetdatavaluesr   emaset_classesnextmodel
parametersr   clipload
text_modelrequires_grad_)trainerr   r   _pr   r   r   on_pretrain_routine_end   s    r%   c                       sH   e Zd ZdZeddf fdd	ZdddZdd	d
Z fddZ  Z	S )WorldTrainera@  
    A class to fine-tune a world model on a close-set dataset.

    This trainer extends the DetectionTrainer to support training YOLO World models, which combine
    visual and textual features for improved object detection and understanding.

    Attributes:
        clip (module): The CLIP module for text-image understanding.
        text_model (module): The text encoder model from CLIP.
        model (WorldModel): The YOLO World model being trained.
        data (dict): Dataset configuration containing class information.
        args (dict): Training arguments and configuration.

    Examples:
        >>> from ultralytics.models.yolo.world import WorldModel
        >>> args = dict(model="yolov8s-world.pt", data="coco8.yaml", epochs=3)
        >>> trainer = WorldTrainer(overrides=args)
        >>> trainer.train()
    Nc                    sX   |du ri }t  ||| zddl}W n ty&   td ddl}Y nw || _dS )a  
        Initialize a WorldTrainer object with given arguments.

        Args:
            cfg (dict): Configuration for the trainer.
            overrides (dict, optional): Configuration overrides.
            _callbacks (list, optional): List of callback functions.
        Nr   z+git+https://github.com/ultralytics/CLIP.git)super__init__r   ImportErrorr   check_requirements)selfcfg	overrides
_callbacksr   	__class__r   r   r(   -   s   	

zWorldTrainer.__init__Tc                 C   sZ   t t|tr
|d n|| jd t| jd d|otdkd}|r%|| | dt |S )a^  
        Return WorldModel initialized with specified config and weights.

        Args:
            cfg (Dict | str, optional): Model configuration.
            weights (str, optional): Path to pretrained weights.
            verbose (bool): Whether to display model info.

        Returns:
            (WorldModel): Initialized WorldModel.
        	yaml_filechannelsncP   r	   )chr3   verboser%   )	r   
isinstancedictr   minr   r   add_callbackr%   )r+   r,   weightsr6   r   r   r   r   	get_modelB   s   

zWorldTrainer.get_modeltrainc              
   C   sH   t t| jrt| jj  ndd}t| j||| j||dk||dkdS )a  
        Build YOLO Dataset for training or validation.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`.

        Returns:
            (Dataset): YOLO dataset configured for training or validation.
        r       valr=   )moderectstridemulti_modal)maxintr   r   rB   r   argsr   )r+   img_pathr@   batchgsr   r   r   build_dataset\   s   $zWorldTrainer.build_datasetc                    s   t  |}ttj|d  }| j||d j}| j	
|j|d jd}||jdddd }|t|d d|jd |d< |S )	z=Preprocess a batch of images and text for YOLOWorld training.textsimg)dtype   r	   T)r$   dimkeepdim	txt_feats)r'   preprocess_batchr   	itertoolschainr   tokenizetor   r    encode_textrM   normreshapelenshape)r+   rH   rK   
text_tokenrQ   r/   r   r   rR   m   s    zWorldTrainer.preprocess_batch)NNT)r=   N)
__name__
__module____qualname____doc__r   r(   r<   rJ   rR   __classcell__r   r   r/   r   r&      s    

r&   )rS   ultralytics.datar   ultralytics.modelsr   ultralytics.nn.tasksr   ultralytics.utilsr   r   r   ultralytics.utils.torch_utilsr   r%   detectDetectionTrainerr&   r   r   r   r   <module>   s   