
    %	&hD,                        d Z ddlmZmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z%  ejL                  e'      Z(dZ)dZ* G d de	jV                        Z, G d de      Z- G d de      Z.dZ/ G d de!      Z0 G d de      Z1 G d de      Z2 G d d e       Z3g d!Z4y)"zPyTorch Starcoder2 model.    )CallableListOptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)%add_start_docstrings_to_model_forwardcan_return_tuplelogging   )MistralAttentionMistralDecoderLayerMistralForCausalLM MistralForSequenceClassificationMistralForTokenClassificationMistralModelapply_rotary_pos_embeager_attention_forward   )Starcoder2Configr   zbigcode/starcoder2-7bc                   h     e Zd Zdef fdZdeeej                        dej                  fdZ	 xZ
S )Starcoder2MLPconfigc                 P   t         |           |j                  }t        j                  ||j
                  |j                        | _        t        j                  |j
                  ||j                        | _        t        |j                     | _        |j                  | _        y N)bias)super__init__hidden_sizer   Linearintermediate_sizeuse_biasc_fcc_projr
   
hidden_actactresidual_dropout)selfr!   	embed_dim	__class__s      /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.pyr&   zStarcoder2MLP.__init__9   su    &&	IIi)A)AX	ii 8 8)&//Z&++, & 7 7    hidden_statesreturnc                     | j                  |      }| j                  |      }| j                  |      }t        j                  j                  || j                  | j                        }|S )Nptraining)r+   r.   r,   r   
functionaldropoutr/   r:   )r0   r5   s     r3   forwardzStarcoder2MLP.forwardA   sZ    		-0/M2--mt?T?T_c_l_l-mr4   )__name__
__module____qualname__r   r&   r   r   torchFloatTensorr=   __classcell__r2   s   @r3   r    r    8   s9    8/ 8XeE4E4E.F%G EL]L] r4   r    c                   :    e Zd Zddedee   f fdZ	 	 ddej                  de	ej                  ej                  f   deej                     dee
   deej                     d	ee   d
e	ej                  eej                     ee	ej                        f   fdZ xZS )Starcoder2Attentionr!   	layer_idxc                    t         |           |j                  | _        t        j                  |j
                  |j                  | j                  z  |j                        | _	        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y r#   )r%   r&   r/   r   r(   r'   num_attention_headshead_dimr*   q_projnum_key_value_headsk_projv_projo_projr0   r!   rG   r2   s      r3   r&   zStarcoder2Attention.__init__J   s     & 7 7ii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii 2 2F4N4NQUQ^Q^4^eketetuii : :T]] JFL^L^eketetur4   r5   position_embeddingsattention_maskpast_key_valuecache_positionkwargsr6   c           
      N   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j                  d	       nt         | j                  j                     } || |	|
||f| j"                  sd
n| j$                  | j&                  t)        | j                  dd       d|\  }} |j*                  g |d j-                         }| j/                  |      }t0        j2                  j5                  || j6                  | j"                        }||fS )Nr   r   )sincosrT   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.g        sliding_window)r<   scalingr]   r8   )shaperJ   rK   view	transposerM   rN   r   updaterG   r   r!   _attn_implementationgetloggerwarning_oncer   r:   attention_dropoutr^   getattrreshape
contiguousrO   r   r;   r<   r/   )r0   r5   rQ   rR   rS   rT   rU   input_shapehidden_shapequery_states
key_statesvalue_statesrY   rX   cache_kwargsattention_interfaceattn_outputattn_weightss                     r3   r=   zStarcoder2Attention.forwardR   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.mm++4004== , 
 L((r4   )N)NN)r>   r?   r@   r   r   intr&   rA   Tensorr   r   
LongTensorr   r   r=   rC   rD   s   @r3   rF   rF   I   s    v/ vHSM v +/594)||4) #5<<#=>4) !.	4)
 !4) !!1!124) -.4) 
u||Xell3XeELL>Q5RR	S4)r4   rF   c                   (     e Zd Zdedef fdZ xZS )Starcoder2DecoderLayerr!   rG   c                 (   t         |   |        t        ||      | _        t	        |      | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y )N)r!   rG   eps)r%   r&   rF   	self_attnr    mlpr   	LayerNormr'   norm_epsiloninput_layernormpost_attention_layernormrP   s      r3   r&   zStarcoder2DecoderLayer.__init__   sh    ,FiP (!||F,>,>FDWDWX(*V5G5GVM`M`(a%r4   )r>   r?   r@   r   rt   r&   rC   rD   s   @r3   rx   rx      s     b/ bC b br4   rx   c                   P    e Zd Zdef fdZe ee      	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     deeeee	j                     f      dee	j                     dee   d	ee   d
ee   dee	j                     dee   defd              Z xZS )Starcoder2Modelr!   c           	      :   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        j                  |j                  |j                        | _        |j                  | _        y c c}w )Nrz   )r%   r&   r   
ModuleListrangenum_hidden_layersrx   layersr~   r'   r   normembedding_dropoutrP   s      r3   r&   zStarcoder2Model.__init__   su     mmHMfNfNfHgh9#FI6h
 LL!3!39L9LM	!'!9!9 is   B	input_idsrR   position_idspast_key_valuesinputs_embeds	use_cacher\   output_hidden_statesrT   flash_attn_kwargsr6   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|
t               }|	F||j                         nd}t        j                  |||j                  d   z   |j                        }	||	j!                  d      }| j#                  |||	||      }|}t$        j&                  j)                  || j*                  | j                        }| j-                  ||      }|rdnd }|rdnd }| j.                  d | j                   j0                   D ],  }|r||fz  } ||f||||||	|d	|
}|d   }|s$||d   fz  }. | j3                  |      }|r||fz  }t5        ||r|nd ||
      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )devicer8    )rR   r   rS   r\   r   rT   rQ   )last_hidden_stater   r5   
attentions)r!   r\   r   r   
ValueErrorgradient_checkpointingr:   re   rf   embed_tokensr   get_seq_lengthrA   aranger_   r   	unsqueeze_update_causal_maskr   r;   r<   r   
rotary_embr   r   r   r   )r0   r   rR   r   r   r   r   r\   r   rT   r   past_seen_tokenscausal_maskr5   rQ   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r3   r=   zStarcoder2Model.forward   s]    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 &--T33dmm . 

 #oom\J #7BD0d![[)H4;;+H+HI 	6M#!m%55!)
*)."3#-$7
 $
M *!,M =#3"55'	6* 		-0  -!11&+/8Od+%	
 	
r4   )	NNNNNNNNN)r>   r?   r@   r   r&   r   r   STARCODER2_INPUTS_DOCSTRINGr   rA   rv   ru   r   r   r   rB   boolr   r   r   r=   rC   rD   s   @r3   r   r      s*   :/ : *+FG 151537KO59$(,0/359[
E,,-[
 !.[
 u//0	[

 "%tE4E4E/F(F"GH[
   1 12[
 D>[
 $D>[
 'tn[
 !!1!12[
 $$89[
 
![
 H [
r4   r   c                       e Zd Zy)Starcoder2ForCausalLMNr>   r?   r@   r   r4   r3   r   r          r4   r   c                       e Zd Zy)#Starcoder2ForSequenceClassificationNr   r   r4   r3   r   r     r   r4   r   c                       e Zd Zy) Starcoder2ForTokenClassificationNr   r   r4   r3   r   r     r   r4   r   )r   r   Starcoder2PreTrainedModelr   r   )5__doc__typingr   r   r   r   r   rA   torch.utils.checkpointr   activationsr
   cache_utilsr   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   mistral.modeling_mistralr   r   r   r   r   r   r   r   configuration_starcoder2r   
get_loggerr>   re   _CONFIG_FOR_DOC_CHECKPOINT_FOR_DOCModuler    rF   rx   r   r   r   r   r   __all__r   r4   r3   <module>r      s   (   9 9    ! . B 6 & U U	 	 	 7 
		H	%$- BII "=)* =)@b0 b # f
l f
R	. 		*J 		'D 	r4   