"""PyTorch Cohere model."""

from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import LossKwargs, logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaRotaryEmbedding,
    eager_attention_forward,
)
from .configuration_cohere import CohereConfig


logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "CohereConfig"


class CohereLayerNorm(nn.Module):
    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
        """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        mean = hidden_states.mean(-1, keepdim=True)
        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
        hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = self.weight.to(torch.float32) * hidden_states
        return hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(CohereLayerNorm)


class CohereRotaryEmbedding(LlamaRotaryEmbedding):
    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    # Split and rotate. Note that this function is different from e.g. Llama.
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
    return rot_x


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    dtype = q.dtype
    q = q.float()
    k = k.float()
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)


class CohereMLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


class CohereAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            # The QK norm is applied per head, hence the (num_heads, head_dim) shape
            self.q_norm = CohereLayerNorm(
                hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps
            )
            self.k_norm = CohereLayerNorm(
                hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        if self.use_qk_norm:  # main diff from Llama
            query_states = self.q_norm(query_states)
            key_states = self.k_norm(key_states)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class CohereDecoderLayer(nn.Module):
    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = CohereAttention(config=config, layer_idx=layer_idx)
        self.mlp = CohereMLP(config)
        self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states_attention, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        # Fully Connected
        hidden_states_mlp = self.mlp(hidden_states)

        # Add everything together (parallel attention/MLP block)
        hidden_states = residual + hidden_states_attention + hidden_states_mlp

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class CohereModel(LlamaModel):
    def __init__(self, config: CohereConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.rotary_emb = CohereRotaryEmbedding(config=config)
        self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class CohereForCausalLM(LlamaForCausalLM):
    def __init__(self, config: CohereConfig):
        super().__init__(config)
        self.model = CohereModel(config)
        self.logit_scale = config.logit_scale
        self.tie_word_embeddings = config.tie_word_embeddings

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >> from transformers import AutoTokenizer, CohereForCausalLM

        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >> prompt = "Hey, are you conscious? Can you talk to me?"
        >> inputs = tokenizer(prompt, return_tensors="pt")

        >> # Generate
        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        logits = logits * self.logit_scale

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"]