o
    WhR                     @   sZ   d dl Zd dlZd dlmZ d dlmZ d dlmZ G dd deZ	G dd de	eZ
dS )	    N)LoadVisualPrompt)DetectionPredictor)SegmentationPredictorc                       sX   e Zd ZdZd fdd	Zdd Z fddZd fd
d	Z fddZdd Z	  Z
S )YOLOEVPDetectPredictoraF  
    A mixin class for YOLO-EVP (Enhanced Visual Prompting) predictors.

    This mixin provides common functionality for YOLO models that use visual prompting, including
    model setup, prompt handling, and preprocessing transformations.

    Attributes:
        model (torch.nn.Module): The YOLO model for inference.
        device (torch.device): Device to run the model on (CPU or CUDA).
        prompts (dict): Visual prompts containing class indices and bounding boxes or masks.

    Methods:
        setup_model: Initialize the YOLO model and set it to evaluation mode.
        set_return_vpe: Set whether to return visual prompt embeddings.
        set_prompts: Set the visual prompts for the model.
        pre_transform: Preprocess images and prompts before inference.
        inference: Run inference with visual prompts.
    Tc                    s   t  j||d d| _dS )z
        Sets up the model for prediction.

        Args:
            model (torch.nn.Module): Model to load or use.
            verbose (bool): If True, provides detailed logging.
        )verboseTN)supersetup_modeldone_warmup)selfmodelr   	__class__ Y/var/www/vscode/kcb/lib/python3.10/site-packages/ultralytics/models/yolo/yoloe/predict.pyr       s   
z"YOLOEVPDetectPredictor.setup_modelc                 C   s
   || _ dS )z
        Set the visual prompts for the model.

        Args:
            prompts (dict): Dictionary containing class indices and bounding boxes or masks.
                Must include a 'cls' key with class indices.
        N)prompts)r
   r   r   r   r   set_prompts+   s   
z"YOLOEVPDetectPredictor.set_promptsc                    s  t  jdd jdd}jd tdkrBd jdd d jdd  |}|dj	_S  dusNJ d  d	t
 tr\td
d  D sdJ d  d	t
trrtdd D szJ d d	tt  krt ksn J dt dt dt  d	 fddttD }tjjjj|ddj	_S )a  
        Preprocess images and prompts before inference.

        This method applies letterboxing to the input image and transforms the visual prompts
        (bounding boxes or masks) accordingly.

        Args:
            im (list): List containing a single input image.

        Returns:
            (list): Preprocessed image ready for model inference.

        Raises:
            ValueError: If neither valid bounding boxes nor masks are provided in the prompts.
        bboxesNmaskscls   r      zExpected bboxes, but got !c                 s       | ]	}t |tjV  qd S N
isinstancenpndarray.0br   r   r   	<genexpr>P       z7YOLOEVPDetectPredictor.pre_transform.<locals>.<genexpr>z#Expected List[np.ndarray], but got c                 s   r   r   r   r   r   r   r   r!   S   r"   z-Expected same length for all inputs, but got vsc              	      s@   g | ]} | jd d | jd d |  | qS )Nr   )_process_single_imageshape)r   ir   categoryimimgr
   r   r   
<listcomp>Y   s    2z8YOLOEVPDetectPredictor.pre_transform.<locals>.<listcomp>T)batch_first)r   pre_transformr   poplenr$   r%   	unsqueezetodevicer   listallrangetorchnnutilsrnnpad_sequence)r
   r)   r   visualsr   r'   r   r-   5   s.   
.

$"
z$YOLOEVPDetectPredictor.pre_transformNc                    s  |durit |ritj|tjd}|jdkr|dddf }t|d |d  |d |d  }||9 }|ddddf  t|d |d |  d d 7  < |ddddf  t|d |d |  d d 7  < n|durt |}t	|}d||dk< nt
d	t ||||S )
a  
        Processes a single image by resizing bounding boxes or masks and generating visuals.

        Args:
            dst_shape (tuple): The target shape (height, width) of the image.
            src_shape (tuple): The original shape (height, width) of the image.
            category (str): The category of the image for visual prompts.
            bboxes (list | np.ndarray, optional): A list of bounding boxes in the format [x1, y1, x2, y2]. Defaults to None.
            masks (np.ndarray, optional): A list of masks corresponding to the image. Defaults to None.

        Returns:
            visuals: The processed visuals for the image.

        Raises:
            ValueError: If neither `bboxes` nor `masks` are provided.
        N)dtyper   r   .r   g?r   z$Please provide valid bboxes or masks)r/   r   arrayfloat32ndimminroundr   r-   stack
ValueErrorr   get_visuals)r
   	dst_shape	src_shaper(   r   r   gainresized_masksr   r   r   r$   a   s   
"68
z,YOLOEVPDetectPredictor._process_single_imagec                    s    t  j|g|R d| ji|S )a/  
        Run inference with visual prompts.

        Args:
            im (torch.Tensor): Input image tensor.
            *args (Any): Variable length argument list.
            **kwargs (Any): Arbitrary keyword arguments.

        Returns:
            (torch.Tensor): Model prediction results.
        vpe)r   	inferencer   )r
   r)   argskwargsr   r   r   rK      s    z YOLOEVPDetectPredictor.inferencec                 C   sT   |  | t| jdksJ d| jD ]\}}}| |}| j|| jdd  S dS )a  
        Processes the source to get the visual prompt embeddings (VPE).

        Args:
            source (str | Path | int | PIL.Image | np.ndarray | torch.Tensor | List | Tuple): The source
                of the image to make predictions on. Accepts various types including file paths, URLs, PIL
                images, numpy arrays, and torch tensors.

        Returns:
            (torch.Tensor): The visual prompt embeddings (VPE) from the model.
        r   z get_vpe only supports one image!T)rJ   
return_vpeN)setup_sourcer/   dataset
preprocessr   r   )r
   source_im0sr)   r   r   r   get_vpe   s   

zYOLOEVPDetectPredictor.get_vpe)T)NN)__name__
__module____qualname____doc__r   r   r-   r$   rK   rU   __classcell__r   r   r   r   r      s    
,%r   c                   @   s   e Zd ZdZdS )YOLOEVPSegPredictorz$Predictor for YOLOE VP segmentation.N)rV   rW   rX   rY   r   r   r   r   r[      s    r[   )numpyr   r6   ultralytics.data.augmentr   ultralytics.models.yolo.detectr   ultralytics.models.yolo.segmentr   r   r[   r   r   r   r   <module>   s    