
    %	&h#                     V   d dl mZmZmZ d dlZd dlmZ d dlmc mZ	 d dl
ZddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ d	d
lmZ  ej6                  e      Z G d dej<                        Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$y)    )CallableOptionalTupleN   )Cache)ALL_ATTENTION_FUNCTIONS)logging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelapply_rotary_pos_embeager_attention_forward   )
OlmoConfigc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )OlmoLayerNormz/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2    t         |           |f| _        y N)super__init__normalized_shape)selfr   	__class__s     {/var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/olmo/modular_olmo.pyr   zOlmoLayerNorm.__init__   s    !,    hidden_statesc                     |j                   }t        j                  |j                  t        j
                        | j                  d d d      j                  |      S )N)dtypegh㈵>)eps)r#   F
layer_normtotorchfloat32r   )r   r!   
orig_dtypes      r   forwardzOlmoLayerNorm.forward!   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r    )
__name__
__module____qualname____doc__intr   r(   Tensorr+   __classcell__r   s   @r   r   r      s4    9/C /D /
U\\ 
ell 
r    r   c                        e Zd Z fdZ xZS )OlmoMLPc                 J   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        y )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r   configr   s     r   r   zOlmoMLP.__init__)   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr    )r,   r-   r.   r   r2   r3   s   @r   r5   r5   (   s    Y Yr    r5   c                      e Zd Z	 	 d	dej                  deej                  ej                  f   deej                     dee   deej                     deej                  eej                     eeej                        f   fdZ	y)
OlmoAttentionNr!   position_embeddingsattention_maskpast_key_valuecache_positionr   c                 V   |j                   d d }g |d| j                  }| j                  |      }	| j                  |      }
| j	                  |      }| j
                  j                  |	j                  | j
                  j                   | j
                  j                         |
j                  | j
                  j                   | j
                  j                         |j                  | j
                  j                   | j
                  j                         |	j                  |      j                  dd      }	|
j                  |      j                  dd      }
|j                  |      j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j
                  j                  dk7  r^| j
                  j                  dk(  r(|j                  dd	      rt         j#                  d
       nt$        | j
                  j                     } || |	|
||f| j&                  sdn| j(                  | j*                  d|\  }} |j,                  g |d j/                         }| j1                  |      }||fS )N)minmaxr   r
   )sincosrD   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.g        )dropoutscaling)shapehead_dimq_projk_projv_projr>   clip_qkvclamp_view	transposer   update	layer_idxr   _attn_implementationgetloggerwarning_oncer   trainingattention_dropoutrO   reshape
contiguouso_proj)r   r!   rA   rB   rC   rD   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrJ   rI   cache_kwargsattention_interfaceattn_outputattn_weightss                     r   r+   zOlmoAttention.forward1   sy    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r    )NN)
r,   r-   r.   r(   r1   r   r   r   
LongTensorr+    r    r   r@   r@   0   s     +/598)||8) #5<<#=>8) !.	8)
 !8) !!1!128) 
u||Xell3XeELL>Q5RR	S8)r    r@   c                   (     e Zd Zdedef fdZ xZS )OlmoDecoderLayerr>   rZ   c                     t         |   ||       t        |j                        | _        t        |j                        | _        t        ||      | _        y )N)r>   rZ   )r   r   r   r   input_layernormpost_attention_layernormr@   	self_attnr   r>   rZ   r   s      r   r   zOlmoDecoderLayer.__init__m   sF    +,V-?-?@(5f6H6H(I%&f	Jr    )r,   r-   r.   r   r0   r   r2   r3   s   @r   rq   rq   l   s    Kz Kc K Kr    rq   c                   $     e Zd Zdef fdZ xZS )	OlmoModelr>   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                        | _
        y c c}w r   )r   r   r8   
ModuleListrangenum_hidden_layersrq   layersr   r   normrv   s      r   r   zOlmoModel.__init__u   s[     mmBGH`H`BabYfi0b
 "&"4"45	 cs   A1)r,   r-   r.   r   r   r2   r3   s   @r   rx   rx   t   s    6z 6 6r    rx   c                       e Zd Zy)OlmoForCausalLMN)r,   r-   r.   ro   r    r   r   r   }   s    r    r   )%typingr   r   r   r(   torch.nnr8   torch.nn.functional
functionalr%   torch.utils.checkpointcache_utilsr   modeling_utilsr   utilsr	   llama.modeling_llamar   r   r   r   r   r   r   configuration_olmor   
get_loggerr,   r]   Moduler   r5   r@   rq   rx   r   ro   r    r   <module>r      s    , ,        5    + 
		H	%
BII 
Yh Y9)N 9)xK( K6
 6	& 	r    