
    %	&hXQ                        d dl mZ d dlZd dlmZ d dlmc mZ d dlmZm	Z	m
Z
 d dlmZmZmZ d dlmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZ  G d d	e      Z G d
 de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d dejJ                        Z& G d de      Z' G d de      Z( G d de      Z) G d de      Z* G d de      Z+ G d  d!e      Z, G d" d#e      Z-g d$Z.y)%    )OptionalN)BCEWithLogitsLossCrossEntropyLossMSELoss)SiglipConfigSiglipTextConfigSiglipVisionConfig)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputSiglipForImageClassificationSiglipModel#SiglipMultiheadAttentionPoolingHeadSiglipOutputSiglipPreTrainedModelSiglipTextModelSiglipTextModelOutputSiglipVisionModelSiglipVisionModelOutputSiglipVisionTransformer   )_prepare_4d_attention_maskc                       e Zd Zy)Siglip2TextConfigN__name__
__module____qualname__     /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/siglip2/modular_siglip2.pyr   r   *       r    r   c                   8     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Siglip2VisionConfigaO  
    This is the configuration class to store the configuration of a [`Siglip2VisionModel`]. It is used to instantiate a
    Siglip2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip2
    [google/siglip2-base-patch16-naflex](https://huggingface.co/google/siglip2-base-patch16-naflex) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        num_patches (`int`, *optional*, defaults to 256):
            The number of patches in the image with the size of (`patch_size`, `patch_size`).
            The image is resized to fill at most this number of patches while preserving
            the aspect ratio. If the resulting number of patches is lower, the image is
            padded in the "patch" dimension.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import Siglip2VisionConfig, Siglip2VisionModel

    >>> # Initializing a Siglip2VisionConfig with google/siglip2-base-patch16-naflex style configuration
    >>> configuration = Siglip2VisionConfig()

    >>> # Initializing a Siglip2VisionModel (with random weights) from the google/siglip2-base-patch16-naflex style configuration
    >>> model = Siglip2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        num_patches=256,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-06,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Siglip2 (NaFlex) works with a variable patch grid bounded by `num_patches`,
        # so the fixed `image_size` inherited from the Siglip vision config does not apply.
        del self.image_size


class Siglip2Config(SiglipConfig):
    pass


class Siglip2VisionOutput(SiglipVisionModelOutput):
    pass


class Siglip2TextOutput(SiglipTextModelOutput):
    pass


class Siglip2Output(SiglipOutput):
    pass
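# Illustrative sketch (not part of the original module): one way an image of arbitrary
# size can be mapped onto a patch grid under the `num_patches` budget described in the
# config docstring above. The helper name and the exact rounding rule are assumptions;
# the actual Siglip2 image processor may resize differently.
def _example_naflex_grid(height: int, width: int, patch_size: int = 16, num_patches: int = 256) -> tuple[int, int]:
    """Pick (grid_h, grid_w) preserving aspect ratio with grid_h * grid_w <= num_patches."""
    import math

    # Scale factor so the total patch count roughly fills the budget.
    scale = math.sqrt(num_patches * patch_size**2 / (height * width))
    grid_h = max(1, int(height * scale / patch_size))
    grid_w = max(1, int(width * scale / patch_size))
    return grid_h, grid_w


# For example, a 480x640 image with patch_size=16 and num_patches=256 yields a
# 13x18 grid (234 patches); the remaining positions are padded in the patch dimension.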
dej                  dej                  dej                  fd	Z xZS )
Siglip2VisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        t        j                  |j                  | j
                  z  | j
                  z  | j                        | _	        |j                  | _
        t        | j                  dz        | _        t        j                  | j                  | j                        | _        y )N)in_featuresout_featuresg      ?)r&   r'   rG   r+   	embed_dimr0   nnLinearr/   patch_embeddingr(   intposition_embedding_size	Embeddingposition_embeddingr*   rG   r5   s     r!   r'   z Siglip2VisionEmbeddings.__init__   s    ++ ++!yy++doo=O 

 "--'*4+;+;S+@'A$"$,,t/?/?"Pr    positional_embeddingsspatial_shapes
max_lengthreturnc                 b   |j                   d   }| j                   d   }| j                  }t        j                  |||f| j                  |      }| j                  ddd      j                  d      } | j                  j                  dk(  r| j                  t        j                        } t        |      D ]w  }||   \  }}	t        j                  | ||	fddd	
      }
|
j                  |||	z        j                  dd      }
|
j                  |      }
|
||d||	z  f<   |
d   ||||	z  df<   y |S )ac  
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        """
        batch_size = spatial_shapes.shape[0]
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        resulted_positional_embeddings = torch.empty(
            (batch_size, max_length, embed_dim),
            device=positional_embeddings.device,
            dtype=source_dtype,
        )

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Upcast to float32 on CPU because antialiased interpolation is not supported for half precision on CPU
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        for i in range(batch_size):
            # (1, dim, height, width) -> (1, dim, target_height, target_width)
            height, width = spatial_shapes[i]
            resized_embeddings = F.interpolate(
                positional_embeddings,
                size=(height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )

            # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)

            # Cast back to the original dtype
            resized_embeddings = resized_embeddings.to(source_dtype)

            resulted_positional_embeddings[i, : height * width] = resized_embeddings
            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]

        return resulted_positional_embeddings

    def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`List[Tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        """
        # Apply patch embeddings to already patchified pixel values
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # Resize and pad the positional embeddings to match each image's patch grid
        positional_embeddings = self.position_embedding.weight.reshape(
            self.position_embedding_size, self.position_embedding_size, -1
        )
        resized_positional_embeddings = self.resize_positional_embeddings(
            positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
        )

        # Add positional embeddings to patch embeddings
        embeddings = patch_embeds + resized_positional_embeddings
        return embeddings
dej                  dej                  dej                  de	e
   de	e
   defd	Z xZS )Siglip2VisionTransformerrG   c                 J    t         |           |j                  dk(  | _        y )Nflash_attention_2)r&   r'   _attn_implementation_use_flash_attention_2rS   s     r!   r'   z!Siglip2VisionTransformer.__init__   s"    &,&A&AEX&X#r    ry   attention_maskrU   output_attentionsoutput_hidden_statesrW   c                    ||n| j                   j                  }||n| j                   j                  }| j                  ||      }|#| j                  st        ||j                        }n|}| j                  ||||      }|j                  }	| j                  |	      }	| j                  r| j                  |	|      nd}
t        |	|
|j                  |j                        S )z
        Returns:

        N)inputs_embedsr   r   r   )last_hidden_statepooler_outputhidden_states
attentions)rG   r   r   r   r   r   r[   encoderr   post_layernormuse_headheadr   r   r   )r*   ry   r   rU   r   r   r   encoder_attention_maskencoder_outputsr   r   s              r!   r   z Siglip2VisionTransformer.forward   s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 nE%d.I.I%?P]PcPc%d"%3"+/<<'1/!5	 ,8 ,
 ,== //0ABHL		"3^D[_)/')77&11	
 	
r    NN)r   r   r   r$   r'   re   r   r   r   r   boolr   r   r;   r<   s   @r!   r   r      sq    Y2 Y -1/3*
''*
 *
 ((	*

 $D>*
 'tn*
 
$*
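# Illustrative sketch (not part of the original module): what the
# `_prepare_4d_attention_mask` helper used above produces. This is a simplified
# equivalent, assuming the helper expands a [batch, seq] padding mask to
# [batch, 1, tgt_seq, src_seq] and turns masked positions into large negative
# biases; see `modeling_attn_mask_utils` for the actual implementation.
def _example_expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None) -> torch.Tensor:
    bsz, src_len = mask.shape
    tgt_len = tgt_len if tgt_len is not None else src_len
    # 1 -> keep (0.0 bias), 0 -> mask (dtype's most negative value, added to attention logits)
    expanded = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    return (1.0 - expanded) * torch.finfo(dtype).min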
class Siglip2PreTrainedModel(SiglipPreTrainedModel):
    pass


class Siglip2TextModel(SiglipTextModel):
    pass


class Siglip2MultiheadAttentionPoolingHead(SiglipMultiheadAttentionPoolingHead):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)
        self.num_heads = config.num_attention_heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        if attention_mask is not None:
            target_len, source_len = probe.shape[1], hidden_state.shape[1]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
            # nn.MultiheadAttention expects an attn_mask of shape (batch * num_heads, tgt_len, src_len)
            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = attention_mask.reshape(-1, target_len, source_len)

        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


class Siglip2VisionModel(SiglipVisionModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        return self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class Siglip2Model(SiglipModel):
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = vision_outputs.pooler_output

        return pooled_output

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Siglip2Output:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # pairwise sigmoid loss: +1 on the diagonal (matched pairs), -1 elsewhere
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return Siglip2Output(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
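# Illustrative sketch (not part of the original module): the pairwise sigmoid loss
# computed in `Siglip2Model.forward` above, restated as a standalone function. The
# function name is an assumption; the math mirrors the `return_loss` branch.
def _example_siglip_loss(logits_per_text: torch.Tensor) -> torch.Tensor:
    # +1 on the diagonal (matched text/image pairs), -1 everywhere else
    eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
    m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
    # log-sigmoid of the signed logits; sum over candidates, average over the batch
    loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
    return -loglik.sum(dim=-1).mean()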
deej
                     deej
                     deej                     deej
                     dee   dee   defd	Z	y)Siglip2ForImageClassificationNry   r   rU   labelsr   r   rW   c                 8   ||n| j                   j                  }||n| j                   j                  }| j                  |||||      }|j                  }|Q|d   j                  |j                        }	t        j                  ||	z  d      t        j                  |	d      z  }nt        j                  |d      }| j                  |      }
d }||j                  |
j                        }| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt!               }| j                  dk(  r& ||
j#                         |j#                               }n ||
|      }n| j                   j                  dk(  r=t%               } ||
j'                  d| j                        |j'                  d            }n,| j                   j                  dk(  rt)               } ||
|      }t+        ||
|j,                  |j.                  	      S )
N)r   rU   r   r   ).Nr]   r   
regressionsingle_label_classificationmulti_label_classificationrY   )r   logitsr   r   )rG   r   r   r   r   rj   rZ   re   r   r   
classifierproblem_type
num_labelsr[   longrO   r   squeezer   viewr   r   r   r   )r*   ry   r   rU   r   r   r   outputssequence_output	pool_maskr   r   loss_fcts                r!   r   z%Siglip2ForImageClassification.forward  s7    2C1N-TXT_T_TqTq$8$D $++JjJj 	 /3.?.?/)/!5 /@ /
 "33  +,Y7::?;Q;QRI#ii)(CKeiiXaghNiiO#jja@O 1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./$!//))	
 	
r    )NNNNNN)
r   r   r   r   re   r   r   r   r   r   r   r    r!   r   r     s     047;59)-,0/3@
u||,@
 'u||4@
 !!1!12	@

 &@
 $D>@
 'tn@
 
@
r    r   )r>   r   r$   r   r   r   r   r   )/typingr   re   torch.nnrL   torch.nn.functionalr   rm   r   r   r   /transformers.models.siglip.configuration_siglipr   r   r	   *transformers.models.siglip.modeling_siglipr
__all__ = [
    "Siglip2Config",
    "Siglip2TextConfig",
    "Siglip2VisionConfig",
    "Siglip2Model",
    "Siglip2PreTrainedModel",
    "Siglip2TextModel",
    "Siglip2VisionModel",
    "Siglip2ForImageClassification",
]
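# Illustrative sketch (not part of the original module): the NaFlex calling convention
# the classes above expect. The checkpoint name follows the example in the config
# docstring; treat the exact processor output keys as assumptions based on the
# model signatures above.
#
#   from PIL import Image
#   from transformers import AutoModel, AutoProcessor
#
#   model = AutoModel.from_pretrained("google/siglip2-base-patch16-naflex")
#   processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-naflex")
#   image = Image.new("RGB", (640, 480))  # stand-in for a real image
#   inputs = processor(text=["a photo of a cat"], images=[image], return_tensors="pt")
#   # inputs carry `pixel_values` of shape (batch, max_num_patches, 3 * 16 * 16),
#   # a `pixel_attention_mask` marking real patches, and `spatial_shapes` of shape (batch, 2)
#   outputs = model(**inputs)
#   probs = torch.sigmoid(outputs.logits_per_image)  # SigLIP scores are sigmoid, not softmax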