
    %	&h=                        d dl mZmZmZmZmZ d dlZd dlZd dlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  ejN                  e(      Z)dZ* G d de!      Z+ G d de      Z, G d de      Z- G d de"      Z. G d de      Z/ G d de       Z0 G d de      Z1 G d d e      Z2y)!    )CallableListOptionalTupleUnionN)nn   )CacheSlidingWindowCacheStaticCache)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONS)Unpack)logging   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModelapply_rotary_pos_embeager_attention_forward   )MistralConfigzmistralai/Mistral-7B-v0.1c                        e Zd Z fdZ xZS )
MistralMLPc                 J   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        y NF)bias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_projselfconfig	__class__s     /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.pyr'   zMistralMLP.__init__#   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWX    )__name__
__module____qualname__r'   __classcell__r1   s   @r2   r"   r"   "   s    Y Yr3   r"   c                   2    e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	ej                     d	ee   d
eej                  e	ej                     e	eej                        f   fdZ xZS )MistralAttentionr0   	layer_idxc                    t         |           t        j                  |j                  |j
                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _	        t        j                  |j                  |j                  | j                  z  d      | _
        t        j                  |j
                  | j                  z  |j                  d      | _        y r$   )r&   r'   r   r(   r)   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projr/   r0   r;   r1   s      r2   r'   zMistralAttention.__init__+   s    ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr3   hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc           
         |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j                  d	       nt         | j                  j                     } || |	|
||f| j"                  sd
n| j$                  | j&                  t)        | j                  dd       d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr   r   )sincosrI   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        sliding_window)dropoutscalingrT   )shaper>   r?   view	transposerA   rB   r   updater;   r   r0   _attn_implementationgetloggerwarning_oncer   trainingattention_dropoutrV   getattrreshape
contiguousrC   )r/   rE   rF   rG   rH   rI   rJ   input_shapehidden_shapequery_states
key_statesvalue_statesrO   rN   cache_kwargsattention_interfaceattn_outputattn_weightss                     r2   forwardzMistralAttention.forward2   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r3   )NN)r4   r5   r6   r    intr'   torchTensorr   r   r
   
LongTensorr   r   rm   r7   r8   s   @r2   r:   r:   *   s    l} l l +/590)||0) #5<<#=>0) !.	0)
 !0) !!1!120) -.0) 
u||Xell3XeELL>Q5RR	S0)r3   r:   c                   (     e Zd Zdedef fdZ xZS )MistralDecoderLayerr0   r;   c                 j    t         |   ||       t        ||      | _        t	        |      | _        y )N)r0   r;   )r&   r'   r:   	self_attnr"   mlprD   s      r2   r'   zMistralDecoderLayer.__init__f   s,    +)9Mf%r3   )r4   r5   r6   r    rn   r'   r7   r8   s   @r2   rs   rs   e   s    &} & & &r3   rs   c                       e Zd Zdef fdZ	 ddej                  dej                  dej                  dedef
dZ	e
dej                  d	ed
edej                  dej                  dej                  dededefd       Z xZS )MistralModelr0   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w N)r&   r'   r   
ModuleListrangenum_hidden_layersrs   layersrD   s      r2   r'   zMistralModel.__init__m   sD     mmEJ6KcKcEde	 3e
es   ArG   input_tensorrI   past_key_valuesrR   c                    | j                   j                  dk(  rS|H|F|d d df   j                         j                         |j	                         d   k7  }|rt        d      |d|v r|S y ||j                         nd}t        |t              }t        |t              }	| j                   j                  dk(  r?|s=|	s;|s9t        j                  |||| j                   j                  | j                        ry |j                  |j                  }}
t!        j"                  |
      j$                  }|j&                  d   }|	s|r|j)                         }n1t        |t         j*                        r|j&                  d   n||z   dz   }| j-                  ||||
|||j&                  d   | j                   |		      }| j                   j                  dk(  r2|0|j                  j.                  d
v r|st        j0                  ||      }|S )Nflash_attention_2rM   r   zYou are attempting to perform batched generation with padding_side='right' this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to  call `tokenizer.padding_side  = 'left'` before tokenizing the input. rS   rQ   )inputs_embedspast_key_values_lengthrT   is_trainingr   )sequence_lengthtarget_lengthdtypedevicerI   
batch_sizer0   r   )cudaxpu)r0   r[   sumitemsize
ValueErrorget_seq_length
isinstancer   r   r   _ignore_causal_mask_sdparT   r_   r   r   ro   finfominrW   get_max_cache_shaperp   5_prepare_4d_causal_attention_mask_with_cache_positiontype_unmask_unattended)r/   rG   r   rI   r   rR   is_padding_rightpast_seen_tokensusing_static_cacheusing_sliding_window_cacher   r   	min_dtyper   r   causal_masks                   r2   _update_causal_maskz MistralModel._update_causal_masks   s    ;;++/BB)o.I#1!R%#8#<#<#>#C#C#EIZIZI\]^I_#_ #$a 
 )c^.C%%
 @O?Z?99;`a'E%/AS%T" KK,,6'+E%%>>*'7#{{99 MM $**L,?,?vKK&**	&,,Q/%);+??AM
 nell; $$R(%7!;  PP+')#))!,;;+ Q 

 KK,,6*%%**o=%
 1CCKQZ[Kr3   r   r   r   r   r   c	                 p   | | j                         dk(  r| }	|	S t        j                  |      j                  }
t        j                  ||f|
||      }	t        j
                  ||      |j                  dd      kD  }|j                  ]t        |t              r||kD  rHt        j
                  ||      |j                  dd      |j                  z
  k  }|j                  |       |	|z  }	|	ddddddf   j                  |ddd      }	| |	j                         }	| j                  d   |kD  r| ddd|f   } | j                  d   }|	ddddddd|f   | ddddddf   j                  |	j                        z   }|dk(  }|	ddddddd|f   j!                  ||
      |	ddddddd|f<   |	S )aV  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`MistralConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        N   )
fill_valuer   r   )r   rM   r   r   )dimro   r   r   fullarangerb   rT   r   r   bitwise_or_expandclonerW   tor   masked_fill)rG   r   r   r   r   rI   r   r0   r   r   r   diagonal_attend_masksliding_attend_maskmask_lengthpadding_masks                  r2   r   zBMistralModel._prepare_4d_causal_attention_mask_with_cache_position   s   H %.*<*<*>!*C(K: 7 E*..I** -0Ye\bK $)<<f#MP^PfPfgiklPm#m $$0 "/3EF/\iJi*/,,}V*T&..r158M8MM+' )445HI//K%dD!Q&67>>z1bRTUK))//1!''+m;%3A~~4E%FN,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r3   )F)r4   r5   r6   r    r'   ro   rp   r
   boolr   staticmethodrn   r   r   r   r7   r8   s   @r2   rx   rx   l   s    
} 
 #(QQ llQ 	Q
 Q  Qf BBB B {{	B
 B B B B B Br3   rx   c                       e Zd Zy)MistralForCausalLMNr4   r5   r6    r3   r2   r   r         r3   r   c                       e Zd Zy)MistralForTokenClassificationNr   r   r3   r2   r   r     r   r3   r   c                       e Zd Zy) MistralForSequenceClassificationNr   r   r3   r2   r   r     r   r3   r   c                   D    e Zd ZdZ fdZd Zd Z	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deeeeej                     f      d	eej                     d
eej                     deej                     dee   dee   defdZ xZS )MistralForQuestionAnsweringmodelc                 H    t         |   |       t        |      | _        | `y rz   )r&   r'   rx   r   transformerr.   s     r2   r'   z$MistralForQuestionAnswering.__init__  s"     !&)
r3   c                 .    | j                   j                  S rz   r   embed_tokens)r/   s    r2   get_input_embeddingsz0MistralForQuestionAnswering.get_input_embeddings   s    zz&&&r3   c                 &    || j                   _        y rz   r   )r/   values     r2   set_input_embeddingsz0MistralForQuestionAnswering.set_input_embeddings#  s    "'

r3   	input_idsrG   position_idsr   r   start_positionsend_positionsrR   output_hidden_statesrK   c
           	         | j                  |||||||	      }|j                  }| j                  |      }|j                  dd      \  }}|j	                  d      j                         }|j	                  d      j                         }d}|| | j                  ||||fi |
}t        ||||j                  |j                        S )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        )rG   r   r   r   rR   r   r   rM   )r   N)lossstart_logits
end_logitsrE   
attentions)
r   last_hidden_state
qa_outputssplitsqueezerc   loss_functionr   rE   r   )r/   r   rG   r   r   r   r   r   rR   r   rJ   outputssequence_outputlogitsr   r   r   s                    r2   rm   z#MistralForQuestionAnswering.forward&  s    0 ,0::)%+'/!5 ,6 ,
 "331#)<<r<#: j#++B/::<''+668
&=+D%4%%lJQ^ibhiD+%!!//))
 	
r3   )	NNNNNNNNN)r4   r5   r6   base_model_prefixr'   r   r   r   ro   rq   FloatTensorr   r
   r   r   r   rm   r7   r8   s   @r2   r   r     s    
'(
 156:37KO596:48,0/33
E,,-3
 !!2!233
 u//0	3

 "%tE4E4E/F(F"GH3
   1 123
 "%"2"233
   0 013
 $D>3
 'tn3
 
&3
r3   r   )3typingr   r   r   r   r   ro   torch.utils.checkpointr   cache_utilsr
   r   r   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   configuration_mistralr    
get_loggerr4   r]   _CHECKPOINT_FOR_DOCr"   r:   rs   rx   r   r   r   r   r   r3   r2   <module>r      s    9 9    A A > B U 5 &    1 
		H	%1 Y Y8)~ 8)v&+ &]: ]@	) 		$? 		'E 	A
"; A
r3   
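

# ---------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows how the classes defined
# above are normally exercised through the public `transformers` API: build a small, randomly
# initialized Mistral model and run a single forward pass. The tiny hyperparameters below are
# illustrative assumptions, not the released mistralai/Mistral-7B-v0.1 configuration. Because of
# the relative imports at the top of this file, run it as a module
# (`python -m transformers.models.mistral.modular_mistral`) or copy the snippet into a script.
if __name__ == "__main__":
    from transformers import MistralConfig as _MistralConfig
    from transformers import MistralForCausalLM as _MistralForCausalLM

    tiny_config = _MistralConfig(
        vocab_size=1000,
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        num_key_value_heads=2,  # grouped-query attention: fewer key/value heads than query heads
        sliding_window=32,
    )
    model = _MistralForCausalLM(tiny_config)
    input_ids = torch.randint(0, tiny_config.vocab_size, (1, 16))
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
    print(outputs.logits.shape)  # expected: torch.Size([1, 16, 1000])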