o
    Vh#                     @   sh   d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	 d dl
mZ ddlmZ G d	d
 d
eZdS )    N)Image)SegmentationPredictor)DEFAULT_CFGchecks)box_iou)scale_masks   )adjust_bboxes_to_image_borderc                       sN   e Zd ZdZeddf fdd	Z fddZdddZd	d
 Zdd Z	  Z
S )FastSAMPredictora[  
    FastSAMPredictor is specialized for fast SAM (Segment Anything Model) segmentation prediction tasks.

    This class extends the SegmentationPredictor, customizing the prediction pipeline specifically for fast SAM. It
    adjusts post-processing steps to incorporate mask prediction and non-maximum suppression while optimizing for
    single-class segmentation.

    Attributes:
        prompts (dict): Dictionary containing prompt information for segmentation (bboxes, points, labels, texts).
        device (torch.device): Device on which model and tensors are processed.
        clip_model (Any, optional): CLIP model for text-based prompting, loaded on demand.
        clip_preprocess (Any, optional): CLIP preprocessing function for images, loaded on demand.

    Methods:
        postprocess: Applies box postprocessing for FastSAM predictions.
        prompt: Performs image segmentation inference based on various prompt types.
        _clip_inference: Performs CLIP inference to calculate similarity between images and text prompts.
        set_prompts: Sets prompts to be used during inference.
    Nc                    s   t  ||| i | _dS )ac  
        Initialize the FastSAMPredictor with configuration and callbacks.

        This initializes a predictor specialized for Fast SAM (Segment Anything Model) segmentation tasks. The predictor
        extends SegmentationPredictor with custom post-processing for mask prediction and non-maximum suppression
        optimized for single-class segmentation.

        Args:
            cfg (dict): Configuration for the predictor. Defaults to Ultralytics DEFAULT_CFG.
            overrides (dict, optional): Configuration overrides.
            _callbacks (list, optional): List of callback functions.
        N)super__init__prompts)selfcfg	overrides
_callbacks	__class__ V/var/www/vscode/kcb/lib/python3.10/site-packages/ultralytics/models/fastsam/predict.pyr   #   s   
zFastSAMPredictor.__init__c                    s   | j dd}| j dd}| j dd}| j dd}t |||}|D ]:}	tjdd|	jd |	jd g|d jtjd}
t	|	j
j|	j}tt|
d |d	k }| dkr`|
|	j
j|< q&| j|||||d
S )a  
        Apply postprocessing to FastSAM predictions and handle prompts.

        Args:
            preds (List[torch.Tensor]): Raw predictions from the model.
            img (torch.Tensor): Input image tensor that was fed to the model.
            orig_imgs (List[numpy.ndarray]): Original images before preprocessing.

        Returns:
            (List[Results]): Processed results with prompts applied.
        bboxesNpointslabelstextsr   r   )devicedtypeg?)r   r   r   r   )r   popr   postprocesstorchtensor
orig_shaper   float32r	   boxesxyxynonzeror   flattennumelprompt)r   predsimg	orig_imgsr   r   r   r   resultsresultfull_boxr"   idxr   r   r   r   3   s   "zFastSAMPredictor.postprocessc                    s  |du r|du r|du r|S g }t |ts|g}|D ]}t|dkr)|| q|jj  jdd |jkrAt d |jd  t	j
t|t	j| jd}|durt	j|t	j| jd}|jdkrd|d n|}|dddf |dddf  |dddf |dddf   }	t	 fdd|D }
t	j d	d
}|	dddf | |
 }d|t	j|
| dd
< |dur<t	j|t	j| jd}|jdkr|d n|}|du rt	|jd }t	j|t	j| jd}t|t|ksJ dt| dt| | dkr	t	jt|t	j| jdnt	j
t|t	j| jd}t||D ]\}}t||t	j dd|d |d f ddd < q||O }|durt |trJ|g}g g }}t|jj D ]5\}}dd |D \}}}} |  dkrv|| qW|t|j||||dddf  qW| ||}t	j|dd
}t|r|t	j|| jdd t |kd7 }d||< |||  q|S )a  
        Perform image segmentation inference based on cues like bounding boxes, points, and text prompts.

        Args:
            results (Results | List[Results]): Original inference results from FastSAM models without any prompts.
            bboxes (np.ndarray | List, optional): Bounding boxes with shape (N, 4), in XYXY format.
            points (np.ndarray | List, optional): Points indicating object locations with shape (N, 2), in pixels.
            labels (np.ndarray | List, optional): Labels for point prompts, shape (N, ). 1 = foreground, 0 = background.
            texts (str | List[str], optional): Textual prompts, a list containing string objects.

        Returns:
            (List[Results]): Output results filtered and determined by the provided prompts.
        Nr   r   )r   r         c                    s>   g | ]} d d |d |d |d |d f j ddqS )Nr   r/   r   r0   r   r0   dim)sum).0bmasksr   r   
<listcomp>o   s   > z+FastSAMPredictor.prompt.<locals>.<listcomp>r1   r2   Tz4Excepted `labels` got same size as `point`, but got z and )as_tuplec                 s   s    | ]}t |V  qd S )N)int)r5   xr   r   r   	<genexpr>   s    z*FastSAMPredictor.prompt.<locals>.<genexpr>d   r   )!
isinstancelistlenappendr8   datashaper    r   r   zerosboolr   	as_tensorint32ndimstackr4   argmaxoneszipr$   str	enumerater"   r#   tolistr   	fromarrayorig_img_clip_inferencer   r;   )r   r+   r   r   r   r   prompt_resultsr,   r.   
bbox_areas
mask_areasfull_mask_areasunion	point_idxpointlabelcrop_ims
filter_idxir6   x1y1x2y2
similaritytext_idxr   r7   r   r'   O   sl   


@
4


.
&zFastSAMPredictor.promptc                    s   zddl }W n ty   td ddl }Y nw t dr#t ds/|jd jd\ _ _t	
 fdd	|D }|| j} j|} j|}||jd
dd }||jd
dd }||dddf  d
S )a  
        Perform CLIP inference to calculate similarity between images and text prompts.

        Args:
            images (List[PIL.Image]): List of source images, each should be PIL.Image with RGB channel order.
            texts (List[str]): List of prompt texts, each should be a string object.

        Returns:
            (torch.Tensor): Similarity matrix between given images and texts with shape (M, N).
        r   Nz+git+https://github.com/ultralytics/CLIP.git
clip_modelclip_preprocesszViT-B/32r@   c                    s   g | ]}  | jqS r   )rh   tor   )r5   imager   r   r   r9      s    z4FastSAMPredictor._clip_inference.<locals>.<listcomp>r?   T)r3   keepdim)clipImportErrorr   check_requirementshasattrloadr   rg   rh   r   rL   tokenizeri   encode_imageencode_textnormr4   )r   imagesr   rm   tokenized_textimage_featurestext_featuresr   rk   r   rU      s   
z FastSAMPredictor._clip_inferencec                 C   s
   || _ dS )z(Set prompts to be used during inference.N)r   )r   r   r   r   r   set_prompts   s   
zFastSAMPredictor.set_prompts)NNNN)__name__
__module____qualname____doc__r   r   r   r'   rU   rz   __classcell__r   r   r   r   r
      s    
Jr
   )r   PILr   ultralytics.models.yolo.segmentr   ultralytics.utilsr   r   ultralytics.utils.metricsr   ultralytics.utils.opsr   utilsr	   r
   r   r   r   r   <module>   s   