
    %	&h                        d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlZddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; dZ<dZ= e(j|                  e?      Z@dZA G d de-      ZB G d de      ZCe G d de             ZD G d dej                        ZF G d de1      ZG G d  d!e4      ZH G d" d#e5      ZI G d$ d%e/      ZJ G d& d'ej                        ZLdZM G d( d)e3      ZN G d* d+e2      ZO G d, d-e0      ZP G d. d/ej                        ZQ G d0 d1e9      ZRg d2ZSy)3    N)Callable)	dataclass)partial)AnyDictListOptionalTupleUnion   )CacheHybridCacheStaticCache)PretrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPastModelOutput)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)%add_start_docstrings_to_model_forwardcan_return_tupleis_torchdynamo_compilingloggingreplace_return_docstrings)deprecate_kwarg   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)!PaliGemmaForConditionalGeneration)SiglipVisionConfigzgoogle/gemma-3-4bGemma3Configc                   8     e Zd ZdZdZ	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3TextConfiga!  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
        rope_local_base_freq (float, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings for local attention.
        sliding_window_pattern (`int`, *optional*, defaults to 6):
            Pattern for the sliding window attention.
    gemma3_textc	                 f    t        
|   | fi |	 || _        || _        || _        t        |        y N)super__init__rope_local_base_freqsliding_window_patternrope_scalingr   )self
vocab_size
rope_thetar5   r3   r4   max_position_embeddingsfinal_logit_softcappingattn_logit_softcappingsuper_kwargs	__class__s             /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.pyr2   zGemma3TextConfig.__init__   s7     	..$8!&<#(t$    )i@  g    .ANg     @   i   NN)__name__
__module____qualname____doc__
model_typer2   __classcell__r=   s   @r>   r-   r-   E   s5    wr J %  ' $#% %r?   r-   c                        e Zd ZdZdZeedZ	 	 	 	 	 	 	 ddee	ee
eef   f      dee	ee
eef   f      dededed	ed
ef fdZ xZS )r+   a  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3)text_configvision_configrJ   rK   mm_tokens_per_imageboi_token_indexeoi_token_indeximage_token_indexinitializer_rangec                 z   | t               }t        j                  d       nt        |t              rt        di |}t        |t              rt        di |}n!|t               }t        j                  d       || _        || _        || _        || _	        || _
        || _        || _        t        	| 8  di | y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config. )r-   loggerinfo
isinstancedictr*   rJ   rK   rL   rM   rN   rO   rP   r1   r2   )
r6   rJ   rK   rL   rM   rN   rO   rP   kwargsr=   s
            r>   r2   zGemma3Config.__init__  s     *,KKKZ[T**9[9KmT*.??M".0MKK`a&*#6 ..!2!2"6"r?   )NN   i i  i   g{Gz?)rA   rB   rC   rD   rE   r-   r*   sub_configsr	   r   r   strr   intfloatr2   rF   rG   s   @r>   r+   r+      s    .` J'+K JNMQ#&&&!(#'#e$4d38n$DEF#  &8$sCx.&H IJ# !	#
 # # # !# #r?   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeeej                     ef      ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Gemma3CausalLMOutputWithPasta  
    Base class for Gemma3 causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsimage_hidden_states)rA   rB   rC   rD   r_   r	   torchFloatTensor__annotations__r`   ra   r   r   r   rb   r
   rc   rd   rR   r?   r>   r^   r^   /  s    < )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;r?   r^   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                 v    t         |   |||       | j                  dt        j                  |      d       y )Nrm   F)
persistent)r1   r2   register_bufferre   tensor)r6   rj   rk   rl   rm   r=   s        r>   r2   z&Gemma3TextScaledWordEmbedding.__init__\  s3    D]ELL,ERWXr?   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S r0   )r1   forwardrm   toweightdtype)r6   rr   r=   s     r>   rt   z%Gemma3TextScaledWordEmbedding.forward`  s2    wy)D,<,<,?,?@Q@Q,RRRr?   )g      ?)rA   rB   rC   rD   r[   r\   r2   re   Tensorrt   rF   rG   s   @r>   ri   ri   W  sG    Ys Y3 YS Y_d YS S Sr?   ri   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 $    t         |   |       y r0   r1   r2   r6   r{   r=   s     r>   r2   zGemma3MLP.__init__e       r?   rA   rB   rC   r-   r2   rF   rG   s   @r>   rz   rz   d  s    !/ ! !r?   rz   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormdimepsc                 "    t         |           y r0   r}   )r6   r   r   r=   s      r>   r2   zGemma3RMSNorm.__init__j  s    r?   )gư>)rA   rB   rC   r[   r\   r2   rF   rG   s   @r>   r   r   i  s    C e  r?   r   c                   &     e Zd Zddef fdZ xZS )Gemma3RotaryEmbeddingr{   c                 $    t         |   |       y r0   r}   )r6   r{   devicer=   s      r>   r2   zGemma3RotaryEmbedding.__init__o  r   r?   r0   r   rG   s   @r>   r   r   n  s    !/ ! !r?   r   c                       e Zd Zdedef fdZ	 	 ddej                  dej                  deej                     dee	   deej                     d	ee   d
eej                  eej                     eeej                        f   fdZ xZS )Gemma3Attentionr{   	layer_idxc                 8   t        |dz   |j                  z        | _        t        |           | j                  r|j
                  nd | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N   )r   r   )boolr4   
is_slidingr1   r2   sliding_windowr   head_dimrms_norm_epsq_normk_normr6   r{   r   r=   s      r>   r2   zGemma3Attention.__init__u  so    	A1N1NNO7;f33D#V=P=PQ#V=P=PQr?   rb   position_embeddingsattention_maskpast_key_valuecache_positionrW   returnc                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|~|||| j                  d}|j                  |
|| j                  |      \  }
}|J| j                  j                  dk(  r1|j                   d   }|
d d d d d |d d f   |d d d d d |d d f   }}
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j!                  dd	      rt"        j%                  d
       nt&        | j                  j                     }||j)                  |	      } || |	|
||f| j*                  r| j,                  nd| j.                  | j                  d|\  }} |j0                  g |d j3                         }| j5                  |      }||fS )Nr   r   )sincosr   r   flash_attention_2eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )dropoutscalingr   )shaper   q_projview	transposek_projv_projr   r   r'   r   updater   r{   _attn_implementationr(   getrS   warning_oncer   ru   trainingattention_dropoutr   reshape
contiguouso_proj)r6   rb   r   r   r   r   rW   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsseq_lenattention_interfaceattn_outputattn_weightss                      r>   rt   zGemma3Attention.forward~  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j% "0"&"5"5	L (6'<'<ZW[WeWegs't$J )dkk.N.NRe.e(..r2+5aHWHa6G+H,WXZ[]e^e]eghWhJiL
(?;;++w6{{//69fjjI\^c>d##L '>dkk>^>^&_#%+..|<N$7
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r?   )NN)rA   rB   rC   r-   r[   r2   re   rx   r	   r   
LongTensorr   r   tuplert   rF   rG   s   @r>   r   r   t  s    R/ RC R +/59@)||@) #\\@) !.	@)
 !@) !!1!12@) -.@) 
u||Xell3XeELL>Q5RR	S@)r?   r   c                   ^    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  dej                  dej                  deej                     deej                     d	ee
   d
ee   dee   deej                     dedeej                  eeej                  ej                  f      f   fdZ xZS )Gemma3DecoderLayerr{   r   c                 ,   t         |           || _        |j                  | _        || _        t        ||      | _        t        |      | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        | j                  j                  | _        |j                   | _        y )N)r{   r   r   )r1   r2   r{   hidden_sizer   r   	self_attnrz   mlpr   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r   r   s      r>   r2   zGemma3DecoderLayer.__init__  s    !--"()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'..33$33r?   rb   position_embeddings_globalposition_embeddings_localr   position_idsr   r   	use_cacher   last_cache_positionr   c                 V   | j                   r|t        |	j                  d   | j                        }| j                  j
                  dk(  r|d d | d f   }nt        j                  |j                        j                  }t        j                  t        j                  |t        j                        | j                         }t        j                  |||      }|
|z
  }t        d|      }|d d d d d d |||z   f   }|}| j                  |      }| j                  j                   r|}n|} | j                  d||||||||	d|\  }}| j!                  |      }||z   }|}| j#                  |      }| j%                  |      }| j'                  |      }||z   }|f}|r||fz  }|S )Nr   r   rw   diagonal)rb   r   r   r   r   r   r   r   rR   )r   maxr   r   r{   r   re   finforw   mintril	ones_liker   wherer   r   r   r   r   r   )r6   rb   r   r   r   r   r   r   r   r   r   rW   effective_seq_len	min_dtypesliding_window_maskoffsetresidualr   self_attn_weightsoutputss                       r>   rt   zGemma3DecoderLayer.forward  s    ??~9 #N$8$8$;T=P=P Q {{//3FF!/4E3E3F0F!G "KK(<(<=AA	&+jjOON%**EQUQdQdPd'# "'-@)^!\ -/@@Q!/1a&K\B\9\0\!] ,,]; >>$$";"<+94>> 
,
' 3)%)/)
,
 
,
(( 55mD =0 66}E/77F =0 ")++Gr?   )NNNFFNr   )rA   rB   rC   r-   r[   r2   re   rx   r	   r   r   r   r   rf   rt   rF   rG   s   @r>   r   r     s   4/ 4C 4& 2637*.,1$)59#$G||G %*LLG $)<<	G
 !.G u//0G !G $D>G D>G !!1!12G !G 
u  (51B1BEDUDU1U+V"WW	XGr?   r   c                       e Zd ZdZg dZd Zy)Gemma3PreTrainedModellanguage_model)r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                    t        | j                  d      r| j                  j                  n| j                  j                  j                  }t	        |t
        j                  t
        j                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y y )NrP   r   )meanstd)hasattrr{   rP   rJ   rU   nnLinearConv2drv   datanormal_biaszero_	Embeddingrl   )r6   moduler   s      r>   _init_weightsz#Gemma3PreTrainedModel._init_weights&  s    
 t{{$78 KK))((:: 	 fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r?   N)rA   rB   rC   base_model_prefix_no_split_modulesr   rR   r?   r>   r   r     s    (?r?   r   c                       e Zd ZeZdef fdZ	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   deej                     dee   d	ee   d
ee   deej                     dee   dee   defdZ xZS )Gemma3TextModelr{   c                 6   t         |   |       t        |j                  |j                  | j
                  | j                  j                  dz        | _        t        j                  |      }|j                  |_        ddi|_        t        |      | _        y )N      ?)rm   	rope_typedefault)r{   )r1   r2   ri   r7   r   rl   r{   embed_tokenscopydeepcopyr3   r8   r5   r   rotary_emb_localr~   s     r>   r2   zGemma3TextModel.__init__<  s      :v1143C3CQUQ\Q\QhQhjmQm
 v&"77*I6 5V Dr?   rr   r   r   ra   inputs_embedsr   r   output_hidden_statesr   r   flash_attn_kwargsr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|rA|?| j                  s3|j                  \  }}}t        | j                   |||j                        }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }|
9d}
|5|j%                         dk(  r|j                  d	   n|	d	   j'                         }
| j)                  |||	||      }|}| j+                  ||      }| j-                  ||      }|rd
nd }|rd
nd }| j.                  d | j                   j0                   D ]v  }|r||fz  }| j
                  r<| j                  r0| j3                  t5        |j6                  fi ||||||||||	|
      }n ||f||||||||	|
d	|}|d   }|sn||d   fz  }x | j9                  |      }|r||fz  }t;        ||||      S )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.F)max_batch_sizemax_cache_lenrw   r   r   r   r   r   rR   )	r   r   r   r   r   r   r   r   r   )last_hidden_statera   rb   rc   )r{   r   r  r   
ValueErrorgradient_checkpointingr   rS   r   r   r   r   rw   get_seq_lengthre   aranger   	unsqueezer   item_update_causal_mask
rotary_embr  layersnum_hidden_layers_gradient_checkpointing_funcr   __call__normr   )r6   rr   r   r   ra   r  r   r   r  r   r   r  
batch_sizer   _past_seen_tokenscausal_maskrb   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                           r>   rt   zGemma3TextModel.forwardK  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0%2%8%8"J))%#))	O !CRC^==?de"\\  =#6#6q#99$++N )33A6L &"#) 1?0B0B0D0IN((,~^`OaOfOfOh $ ..
 & &*__]L%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HI $	6M#!m%55!**t}} $ A AM22H6GH!.- #%"'! !.!!/I.G#.!-#2&7'#1(;! (! *!,M =#3"55I$	6L 		-0-!11&+++%	
 	
r?   )
NNNNNNNNNN)rA   rB   rC   r-   config_classr2   r	   re   r   rx   r   rf   r   r[   r   r   r   rt   rF   rG   s   @r>   r   r   9  s   #LE/ E" 1515371559$(,0/359-1A
E,,-A
 !.A
 u//0	A

 "+.A
   1 12A
 D>A
 $D>A
 'tnA
 !!1!12A
 &c]A
 $$89A
 
!A
r?   r   c                   ,     e Zd ZeZdZdef fdZ xZS )Gemma3ForCausalLMr   r{   c                 D    t         |   |       t        |      | _        y r0   )r1   r2   r   modelr~   s     r>   r2   zGemma3ForCausalLM.__init__  s     $V,
r?   )rA   rB   rC   r-   r!  r   r2   rF   rG   s   @r>   r#  r#    s     #L(-/ - -r?   r#  c                   D     e Zd Zdef fdZdej                  fdZ xZS )Gemma3MultiModalProjectorr{   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr   r   )kernel_sizestride)r1   r2   r   	Parameterre   zerosrK   r   rJ   mm_input_projection_weightr   layer_norm_epsmm_soft_emb_normr[   
image_size
patch_sizepatches_per_imagerL   tokens_per_sider)  	AvgPool2davg_poolr~   s     r>   r2   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r?   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr   r   )r   r   r   r2  r   r5  flattenr/  re   matmulr-  type_as)	r6   r6  r  r  
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r>   rt   z!Gemma3MultiModalProjector.forward  s    $2$8$8!
Az"0":":1a"@"9"A"A
D$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r?   )	rA   rB   rC   r+   r2   re   rx   rt   rF   rG   s   @r>   r'  r'    s#    \| \ @ell @r?   r'  c            #       \   e Zd Zd Zdej
                  dej
                  fdZ	 ddefdZe	 e
ddd	
       ee       eee      	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej"                     deej$                     deej
                     deej"                     deeeej$                     ef      deej"                     deej"                     deej$                     deej"                     dee   dee   dee   d	eeej
                  f   deeef   fd                            Z	 	 	 	 	 	 	 	 	 	 ddZy)Gemma3ForConditionalGenerationc                 6    | j                   j                         S r0   )r   tie_weights)r6   s    r>   rC  z*Gemma3ForConditionalGeneration.tie_weights  s    ""..00r?   pixel_valuesr   c                 `    | j                  |      j                  }| j                  |      }|S )a  
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )rD  )vision_towerr  multi_modal_projector)r6   rD  r6  image_featuress       r>   get_image_featuresz1Gemma3ForConditionalGeneration.get_image_features   s3     ***EWW33NCr?   is_trainingc                    | j                   j                  j                  dk(  r|S ||j                         dk(  r|S t	        |t
              }t        j                  | j                        j                  }|j                  d d \  }	}
|r|j                         }nUt	        |t              r|j                         }n4t	        |t        j                        r|j                  d   n
|d   |
z   dz   }||j                         dk(  r|S t        j                  |
|f|| j                  |j                        }|
dk7  rt        j                   |d      }|t        j"                  ||j                  	      |j%                  dd      kD  z  }|d d d d d d f   j'                  |	ddd      }||
dk7  r|j)                  d      |j)                  d      k(  }d
||dk(  <   |j)                  d      j+                  |j                  t        j,                        }|j/                         }|d d d d d d d |
f   j1                  |d      |d d d d d d d |
f<   ||j/                         }|j                  d   }|d d d d d d d |f   |d d d d d d f   j+                  |j                        z   }|dk(  }|d d d d d d d |f   j1                  ||      |d d d d d d d |f<   |S )Nr      r   r   r   r   )
fill_valuerw   r   r   r
  Fr   r   )r{   rJ   r   r   rU   r   re   r   rw   r   r   get_max_cache_shaper   rx   fullr   triur  r   expandr  ru   r   clonemasked_fill)r6   r   token_type_idsra   r   input_tensorrJ  using_static_cacher   inputs_lead_dimsequence_lengthtarget_lengthr  token_type_maskmask_lengthpadding_masks                   r>   r  z2Gemma3ForConditionalGeneration._update_causal_mask  s    ;;""77;NN!!%.*<*<*>!*C "!'EKK

+//	+7+=+=bq+A(+??AM5+??AM nell; $$R(#A&81<  %.*<*<*>!*C!!jjm,$**]k]r]r

 a**[1=Ku||M.:O:OPSaSiSijlnoSppp!$a"23::?ArSUV %/Q*>,66q9^=U=UVW=XXO38ONa/0-77:==k>P>PX]XbXb=cO%++-K5@AqJZ?JZAZ5[5g5g6K1a!1/!112 %%++-K(..r2K 'q!Q'<=qRVX\^_O_@`@c@cdodvdv@wwL'1,L1<Q1l{l=R1S1_1_i2K1a+-. r?   num_logits_to_keepz4.50logits_to_keep)versionnew_name)output_typer!  Nrr   r   r   ra   rT  r   r  labelsr   r   r  c                 \	   |du |duz  rt        d      ||n| j                  j                  }||n| j                  j                  }|duxr |	du}|R| j                  j                  | j
                  k\  r/|| j                  j                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|{| j                  |      }|\| | j                         t        j                  | j                  j                  t        j                  |j                              k(  }nR|| j                  j                  k(  j!                  d      }|j#                  |      j%                  |j                        }t'               sx||   j)                         |j)                         k7  rT|j+                  d      j+                  d      d   }t        d	| d
|j                  d   |j                  d   z   d      |j%                  |j                  |j,                        }|j/                  ||      }|	[| j0                  |	v rMt2        j5                  d       t        j6                  || j0                  k(  | j                  j8                  |	      }	| j;                  ||||||      } | j<                  d|||||
||||d	|}|j>                  }d}|	O|jA                         }|dddddf   }|	dddf   }||dd|j                  d    df   j%                  |j                        }||j%                  |j                        dk7     jC                         }||j%                  |j                        dk7     jC                         }n |jC                         }|jC                         }tE        jF                         }|jI                  d| j                  jJ                  j
                        }|jI                  d      j%                  |j                        } |||      }tM        |||jN                  |jP                  |jR                  |      S d      S )a
  
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        Nr  r   r   r
  )rw   r   r   )r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.z`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.)	r   r   ra   r  r   r   r  r   r^  .)r_   r`   ra   rb   rc   rd   rR   )*r  r{   r   r  rO   r7   rR  get_input_embeddingsr  re   r  r   r   rI  rq   longr  	expand_asru   r   numelsumrw   masked_scatterpad_token_idrS   r   r   ignore_indexr  r   r`   r\   r   r   CrossEntropyLossr   rJ   r^   ra   rb   rc   )r6   rr   rD  r   r   ra   rT  r   r  rb  r   r   r  r^  	lm_kwargsrJ  special_image_maskllm_input_idsr  rH  image_tokens_in_textr  r   r`   r_   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                  r>   rt   z&Gemma3ForConditionalGeneration.forwardS  s   V -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 %D0GV45G  T[[%B%Bdoo%U!*dkk.K.K!K%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN %26Qd6O6O6QLL!>!>ejjYfYmYmn7 &" '04;;3P3P&P%[%[\^%_"%7%A%A-%P%S%STaThTh%i"+--@R2S2Y2Y2[_m_s_s_u2u(:'?'?A'?'F'J'Jq'J'QRS'T$ /00N~OcOcdeOfiwi}i}~  jA  PA  OB B44 
 ,..}/C/C]EXEXYN)889K^\M $"3"3v"=x [[d.?.?!?AYAY[abF..NO^]\g
 +>$*=*= +
&%+'/!5))+
 +
 \\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//))2>2J
 	
 QU
 	
r?   c                      | j                   j                  |f||||||	|
|d|}|d   dk(  r||d<   |d uxr |d u}|d   dk(  r1t        |t              r!||n|}| j	                  ||||||      }||d<   |S )N)ra   r  r   r   r   r   r^  rT  r   rD  r   )r   prepare_inputs_for_generationrU   r   r  )r6   rr   ra   r  r   r   rD  r   rT  r   r^  rb  rW   model_inputsrJ  rU  r  s                    r>   rx  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation  s      It**HH
+')%)))
 
 !!+7L($D0GV45G!!j+&N,9,E=9L22Q]_jK .9L)*r?   )F)NNNNNNNNNNNNr   )
NNNNNNNTNN)rA   rB   rC   rC  re   rx   rI  r   r  r   r   r   GEMMA3_INPUTS_DOCSTRINGr   r^   _CONFIG_FOR_DOCr	   r   rf   r   r   r   r[   r
   rt   rx  rR   r?   r>   rA  rA    s    1u||  * "C CJ )6DTU*+BC+GVef 15481537KO595959-1$(,0/334n
E,,-n
 u001n
 !.	n

 u//0n
 "%U->->(?(F"GHn
 !!1!12n
 !!1!12n
   1 12n
 ))*n
 D>n
 $D>n
 'tnn
 c5<</0n
  
u22	3!n
 g D V n
f )r?   rA  )r+   r-   r   r   r#  rA  )Tr   collections.abcr   dataclassesr   	functoolsr   typingr   r   r   r	   r
   r   re   torch.nnr   torch.utils.checkpointcache_utilsr   r   r   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r    r!   r"   r#   r$   r%   r&   r'   r(   paligemma.modeling_paligemmar)   siglipr*   _CHECKPOINT_FOR_DOCr{  
get_loggerrA   rS   rz  r-   r+   r^   r   ri   rz   r   r   r   Moduler   GEMMA3_START_DOCSTRINGr   r   r#  r'  rA  __all__rR   r?   r>   <module>r     s     $ !  : :    : : 3 B 
 : 5 &  1 6
 
 
 M ' *  			H	% N%| N%bV## V#r $<; $< $<N
SBLL 
S!	 !
M 
!1 !J)o J)ZV Vr  ?1 ?8S
k S
l-) -!@		 !@Ht%F tn	r?   