"""PyTorch VitPose model."""

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import load_backbone
from .configuration_vitpose import VitPoseConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "VitPoseConfig"


@dataclass
class VitPoseEstimatorOutput(ModelOutput):
    """
    Class for outputs of pose estimation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
        heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
            Heatmaps as predicted by the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    heatmaps: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class VitPosePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VitPoseConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to float32 so `trunc_normal_` stays numerically stable in half precision, then cast back.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


VITPOSE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VitPoseConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITPOSE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`VitPoseImageProcessor`]. See
            [`VitPoseImageProcessor.__call__`] for details.

        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training. For a single dataset, index 0 refers to that dataset; for multiple datasets, index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).

        flip_pairs (`torch.Tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"):
    """Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.Tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

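    Example (a minimal, illustrative sketch with toy tensors; the keypoint pairs below are arbitrary and only
    need to be valid indices into the keypoint dimension):

    ```python
    >>> import torch

    >>> heatmaps = torch.rand(1, 17, 64, 48)  # (batch_size, num_keypoints, height, width)
    >>> flip_pairs = torch.tensor([[1, 2], [3, 4]])  # hypothetical mirrored keypoint pairs
    >>> flipped_back = flip_back(heatmaps, flip_pairs)
    >>> flipped_back.shape
    torch.Size([1, 17, 64, 48])
    ```
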
    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
    """
    if target_type not in ["gaussian-heatmap", "combined-target"]:
        raise ValueError("target_type should be gaussian-heatmap or combined-target")

    if output_flipped.ndim != 4:
        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")

    batch_size, num_keypoints, height, width = output_flipped.shape
    channels = 1
    if target_type == "combined-target":
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)
    output_flipped_back = output_flipped.clone()

    # Swap the heatmaps of each mirrored keypoint pair.
    for left, right in flip_pairs.tolist():
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
    # Flip the heatmaps horizontally to undo the horizontal flip of the input image.
    output_flipped_back = output_flipped_back.flip(-1)
    return output_flipped_back


class VitPoseSimpleDecoder(nn.Module):
    """
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
    """

    def __init__(self, config) -> None:
        super().__init__()

        self.activation = nn.ReLU()
        self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False)
        self.conv = nn.Conv2d(
            config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor:
        hidden_state = self.activation(hidden_state)
        hidden_state = self.upsampling(hidden_state)
        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


class VitPoseClassicDecoder(nn.Module):
    """
    Classic decoding head consisting of two deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    """

    def __init__(self, config: VitPoseConfig):
        super().__init__()

        self.deconv1 = nn.ConvTranspose2d(
            config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False
        )
        self.batchnorm1 = nn.BatchNorm2d(256)
        self.relu1 = nn.ReLU()

        self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.relu2 = nn.ReLU()

        self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor:
        hidden_state = self.deconv1(hidden_state)
        hidden_state = self.batchnorm1(hidden_state)
        hidden_state = self.relu1(hidden_state)

        hidden_state = self.deconv2(hidden_state)
        hidden_state = self.batchnorm2(hidden_state)
        hidden_state = self.relu2(hidden_state)

        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


@add_start_docstrings(
    "The VitPose model with a pose estimation head on top.",
    VITPOSE_START_DOCSTRING,
)
class VitPoseForPoseEstimation(VitPosePreTrainedModel):
    def __init__(self, config: VitPoseConfig) -> None:
        super().__init__(config)

        self.backbone = load_backbone(config)

        # The decoder heads expect the backbone to expose these attributes.
        if not hasattr(self.backbone.config, "hidden_size"):
            raise ValueError("The backbone should have a hidden_size attribute")
        if not hasattr(self.backbone.config, "image_size"):
            raise ValueError("The backbone should have an image_size attribute")
        if not hasattr(self.backbone.config, "patch_size"):
            raise ValueError("The backbone should have a patch_size attribute")

        self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(VITPOSE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=VitPoseEstimatorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        dataset_index: Optional[torch.Tensor] = None,
        flip_pairs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, VitPoseEstimatorOutput]:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values,
            dataset_index=dataset_index,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        # Turn the patch-token sequence back into a (batch_size, hidden_size, patch_height, patch_width) feature map.
        sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1]
        batch_size = sequence_output.shape[0]
        patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0]
        patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1]
        sequence_output = (
            sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous()
        )

        heatmaps = self.head(sequence_output, flip_pairs=flip_pairs)

        if not return_dict:
            if output_hidden_states:
                output = (heatmaps,) + outputs[1:]
            else:
                output = (heatmaps,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return VitPoseEstimatorOutput(
            loss=loss,
            heatmaps=heatmaps,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VitPosePreTrainedModel", "VitPoseForPoseEstimation"]