from functools import partial
from typing import Callable, Optional, Tuple

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...cache_utils import Cache, HybridCache
from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..cohere.modeling_cohere import (
    CohereAttention,
    CohereDecoderLayer,
    CohereForCausalLM,
    CohereLayerNorm,
    CoherePreTrainedModel,
    CohereRotaryEmbedding,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..gemma2.modeling_gemma2 import Gemma2Model


logger = logging.get_logger(__name__)


class Cohere2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Cohere2Model`]. It is used to instantiate a
    Cohere2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Cohere2Model`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        sliding_window_pattern (`int`, *optional*, defaults to 4):
            Pattern for the sliding window attention: every `sliding_window_pattern`-th layer attends to the full
            context (global attention), while all other layers use sliding window attention.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere2 model configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
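    >>> # Illustrative: with the default `sliding_window_pattern=4`, every 4th decoder layer attends to the
    >>> # full context while the remaining layers use sliding-window attention
    >>> pattern = configuration.sliding_window_pattern  # doctest: +SKIP
    >>> ["global" if (i + 1) % pattern == 0 else "sliding" for i in range(4)]  # doctest: +SKIP
    ['sliding', 'sliding', 'sliding', 'global']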
    ```
    """

    model_type = "cohere2"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        logit_scale=0.0625,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        sliding_window=4096,
        sliding_window_pattern=4,
        cache_implementation="hybrid",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.logit_scale = logit_scale
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.sliding_window = sliding_window
        self.sliding_window_pattern = sliding_window_pattern
        self.head_dim = hidden_size // num_attention_heads
        self.cache_implementation = cache_implementation

        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class Cohere2RotaryEmbedding(CohereRotaryEmbedding):
    pass


class Cohere2LayerNorm(CohereLayerNorm):
    pass


class Cohere2Attention(CohereAttention, nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Cohere2Config, layer_idx: Optional[int] = None):
        nn.Module.__init__(self)
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.sliding_window = (
            config.sliding_window if (self.layer_idx + 1) % self.config.sliding_window_pattern != 0 else None
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings

        if self.sliding_window is not None:
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "sliding_window": self.sliding_window,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

            # Here we need to slice as we use a static cache by default, but FA2 does not support it
            if attention_mask is not None and self.config._attn_implementation == "flash_attention_2":
                seq_len = attention_mask.shape[-1]
                key_states, value_states = key_states[:, :, :seq_len, :], value_states[:, :, :seq_len, :]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Cohere2DecoderLayer(CohereDecoderLayer):
    def __init__(self, config: Cohere2Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.self_attn = Cohere2Attention(config, layer_idx)
        self.config = config
        self.is_sliding = (layer_idx + 1) % self.config.sliding_window_pattern != 0
        self.sliding_window = config.sliding_window

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        last_cache_position: int = 0,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            last_cache_position (`int`): equivalent to `cache_position[-1]` but allows indexing without breaking dynamo tracing
        """
        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
            # In prefill, we may be larger than the sliding window
            effective_seq_len = max(cache_position.shape[0], self.sliding_window)
            # For FA2, the mask is 2D and is of shape [bs, processed_tokens] (not [bs, max_cache_len]),
            # thus we must slice from the right (at most `effective_seq_len` elements)
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask[:, -effective_seq_len:]
            # Otherwise, the mask is 4D of shape [bs, 1, query_len, max_cache_len] thus we must slice
            # from the left, with an offset if we are beyond the sliding window
            else:
                min_dtype = torch.finfo(hidden_states.dtype).min
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # In case we are beyond the sliding window, we need to correctly offset the mask slicing
                # `last_cache_position` is equivalent to `cache_position[-1]` but without breaking dynamo
                offset = last_cache_position - effective_seq_len
                # Should only be used when beyond the sliding window (i.e. offset > 0)
                offset = max(0, offset)
                attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states_attention, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        # Fully Connected
        hidden_states_mlp = self.mlp(hidden_states)

        hidden_states = residual + hidden_states_attention + hidden_states_mlp

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Cohere2PreTrainedModel(CoherePreTrainedModel):
    config_class = Cohere2Config


class Cohere2Model(Gemma2Model):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Cohere2DecoderLayer`]
    Args:
        config: Cohere2Config
    """

    def __init__(self, config: Cohere2Config):
        super().__init__(config)
        self.norm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
        self.rotary_emb = Cohere2RotaryEmbedding(config=config)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        last_cache_position: Optional[int] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            batch_size, seq_len, _ = inputs_embeds.shape
            past_key_values = HybridCache(
                self.config,
                max_batch_size=batch_size,
                max_cache_len=seq_len,
                dtype=inputs_embeds.dtype,
                device=self.device,
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
        # (retrieving the same value from `cache_position` later on would crash dynamo)
        if last_cache_position is None:
            last_cache_position = 0
            if attention_mask is not None:
                # In case a 4d mask is passed directly without using `generate`, we have to rely on cache_position
                # It will break dynamo tracing but there is no way around it (and it should never happen in practice)
                last_cache_position = (
                    attention_mask.shape[-1] if attention_mask.dim() == 2 else cache_position[-1].item()
                )

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    position_embeddings,
                    causal_mask,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    last_cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    position_embeddings=position_embeddings,
                    attention_mask=causal_mask,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    last_cache_position=last_cache_position,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class Cohere2ForCausalLM(CohereForCausalLM):
    def __init__(self, config: Cohere2Config):
        super().__init__(config)

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten: has a special cache type, `HybridCache`

        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no-op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]
                # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s
                # `mode="reduce-overhead"`, as otherwise the input `position_ids` would have various strides
                # during the decoding. Here, simply using `.contiguous()` is not sufficient as in the
                # batch size = 1 case, `position_ids` is already contiguous but with varying stride
                # which retriggers a capture.
                position_ids = position_ids.clone(memory_format=torch.contiguous_format)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
        else:
            # The clone here is for the same reason as for `position_ids`.
            model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}

        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
            and not self.config._attn_implementation == "flash_attention_2"
        ):
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                device = model_inputs["inputs_embeds"].device
            else:
                batch_size, sequence_length = model_inputs["input_ids"].shape
                device = model_inputs["input_ids"].device

            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
                attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.get_max_cache_shape(),
                dtype=self.lm_head.weight.dtype,
                device=device,
                cache_position=cache_position,
                batch_size=batch_size,
            )

        if logits_to_keep is not None:
            model_inputs["logits_to_keep"] = logits_to_keep

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
            }
        )
        return model_inputs


__all__ = ["Cohere2Config", "Cohere2ForCausalLM", "Cohere2Model", "Cohere2PreTrainedModel"]