
    %	&h/                     D   d dl mZ d dlmZmZmZmZ d dlZd dlZd dlm	Z	 ddl
mZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ ddlmZ  ej<                  e      Z  G d de      Z! G d de      Z" G d de      Z# G d dee      Z$ G d de      Z%y)    )partial)ListOptionalTupleUnionN)nn   )CacheDynamicCache)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)
LossKwargslogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModel   )GraniteConfigc                   4     e Zd ZdZddedee   f fdZ xZS )GraniteAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 H    t         |   ||       |j                  | _        y N)super__init__attention_multiplierscalingselfr   r   	__class__s      /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/granite/modular_granite.pyr    zGraniteAttention.__init__&   s    +22    r   )	__name__
__module____qualname____doc__r   r   intr    __classcell__r%   s   @r&   r   r   #   s"    G3} 3# 3 3r'   r   c                   f    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	ee   d
eej                     deeej                  ej                  f      deej                  eeej                  ej                  f      f   fdZ xZS )GraniteDecoderLayerr   r   c                 l    t         |   ||       |j                  | _        t        ||      | _        y )N)r   r   )r   r    residual_multiplierr   	self_attnr#   s      r&   r    zGraniteDecoderLayer.__init__,   s.    +#)#=#= )9Mr'   hidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    |}
| j                  |      } | j                  d||||||||d|	\  }}|
|| j                  z  z   }|}
| j                  |      }| j	                  |      }|
|| j                  z  z   }|f}|r||fz  }|S )a.  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r4   r5   r6   r7   r8   r9   r:   r;    )input_layernormr3   r2   post_attention_layernormmlp)r$   r4   r5   r6   r7   r8   r9   r:   r;   kwargsresidualself_attn_weightsoutputss                r&   forwardzGraniteDecoderLayer.forward1   s    D !,,]; ,:4>> 
,
')%)/) 3
,
 
,
(( !=43K3K#KK !55mD/ =43K3K#KK ")++Gr'   )NNNFFNN)r(   r)   r*   r   r,   r    torchTensorr   
LongTensorr
   boolr   FloatTensorrF   r-   r.   s   @r&   r0   r0   +   s    N} N N 2637*.,1$)59KO?||? !.? u//0	?
 !? $D>? D>? !!1!12? &eELL%,,,F&GH? 
u  (51B1BEDUDU1U+V"WW	X?r'   r0   c                       e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	   deej                     dee   d	ee   d
ee   deej                     dee   defdZ xZS )GraniteModelr   c           	          t         |   |       |j                  | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r   )	r   r    embedding_multiplierr   
ModuleListrangenum_hidden_layersr0   layersr#   s      r&   r    zGraniteModel.__init__t   sR     $*$?$?!mmEJ6KcKcEde	 3e
es   A(	input_idsr5   r6   past_key_valuesinputs_embedsr9   r8   output_hidden_statesr:   flash_attn_kwargsr<   c
                 0   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|| j                  z  }|r|
t               }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }| j%                  |||	||      }|}| j'                  ||      }|rdnd }|rdnd }| j(                  d | j                   j*                   D ]r  }|r||fz  }| j
                  r:| j                  r.| j-                  t/        |j0                  fi |
|||||||	|	      }n ||f||||||	|d|
}|d   }|sj||d   fz  }t | j3                  |      }|r||fz  }t5        ||r|nd ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )devicer>   )r5   r6   r7   r8   r9   r:   r;   )last_hidden_staterU   r4   
attentions)r   r8   rW   r9   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrO   r   get_seq_lengthrG   arangeshaperZ   	unsqueeze_update_causal_mask
rotary_embrS   rR   _gradient_checkpointing_funcr   __call__normr   )r$   rT   r5   r6   rU   rV   r9   r8   rW   r:   rX   past_seen_tokenscausal_maskr4   r;   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r&   rF   zGraniteModel.forward{   s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 & #oom\J #7BD0d![[)H4;;+H+HI  	6M#!m%55!**t}} $ A AM22H6GH! #%"'
! !.!
!#.!-#2&7'#1(;
! (
! *!,M =#3"55A 	6D 		-0  -!11&+/8Od+%	
 	
r'   )	NNNNNNNNN)r(   r)   r*   r   r    r   rG   rI   rH   r
   rK   rJ   r   r   r   rF   r-   r.   s   @r&   rM   rM   s   s    
} 
 151537+/59$(,0/359g
E,,-g
 !.g
 u//0	g

 "%g
   1 12g
 D>g
 $D>g
 'tng
 !!1!12g
 $$89g
 
!g
r'   rM   c                       e Zd Zy)KwargsForCausalLMN)r(   r)   r*   r>   r'   r&   rs   rs      s    r'   rs   c                   \   e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddeej
                     deej                     deej
                     deeee	ej                     f      deej                     deej
                     dee   d	ee   d
ee   deej
                     deeej                  f   dee   defdZy)GraniteForCausalLMNrT   r5   r6   rU   rV   labelsr9   r8   rW   r:   logits_to_keeprB   r<   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d||||||||	|
d	|}|j                  }t        |t              rt        | d       n|}| j                  |d d |d d f         }|| j                   j                  z  }d }|* | j                  d||| j                   j                  d|}t        |||j                  |j                  |j                        S )N)	rT   r5   r6   rU   rV   r9   r8   rW   r:   )logitsrv   
vocab_size)lossry   rU   r4   r\   r>   )r   r8   rW   modelr[   
isinstancer,   slicelm_headlogits_scalingloss_functionrz   r   rU   r4   r\   )r$   rT   r5   r6   rU   rV   rv   r9   r8   rW   r:   rw   rB   rE   r4   slice_indicesry   r{   s                     r&   rF   zGraniteForCausalLM.forward   s.    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A$++444%4%%pVFt{{OeOepiopD%#33!//))
 	
r'   )NNNNNNNNNNr   )r(   r)   r*   r   rG   rI   rH   r   r
   r   rK   rJ   r,   r   rs   r   rF   r>   r'   r&   ru   ru      s$    151537KO59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 "%tE4E4E/F(F"GH2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 *+2
 
 2
r'   ru   )&	functoolsr   typingr   r   r   r   rG   torch.utils.checkpointr   cache_utilsr
   r   modeling_flash_attention_utilsr   modeling_outputsr   r   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   configuration_graniter   
get_loggerr(   r`   r   r0   rM   rs   ru   r>   r'   r&   <module>r      s      / /    . B O & ( b b 0 
		H	%3~ 3E+ EPo
: o
d ?,j >3
) 3
r'   