
    %	&h                        d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6  e*jn                  e8      Z9dZ:dZ; G d de      Z< G d de	jz                        Z> G d de	jz                        Z? G d de-      Z@ G d de.      ZA G d d e1      ZB G d! d"e	jz                        ZCd#ZD e'd$eD       G d% d&e#             ZE G d' d(eE      ZF G d) d*e2      ZGd+ZH e'd$eD       G d, d-e5             ZI e'd.eD       G d/ d0eEe             ZJg d1ZKy)2    )partial)CallableOptionalTupleUnionN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)rope_config_validation)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)add_start_docstrings%add_start_docstrings_to_model_forwardcan_return_tupleloggingreplace_return_docstrings   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightzUsefulSensors/moonshine-tinyMoonshineConfigc                   j     e Zd ZdZdZdgZddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 fd	Z xZS )
r)   a"  
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `encoder_num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model's generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.9):
            Percentage of the query and keys which will have rotary embedding.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
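    >>> # Illustrative only: the defaults documented above can be read straight off a
    >>> # freshly constructed (non-pretrained) config
    >>> MoonshineConfig().hidden_size
    288
    >>> MoonshineConfig().encoder_num_attention_heads
    8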
    ```"""

    model_type = "moonshine"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "encoder_num_key_value_heads",
        "num_attention_heads": "encoder_num_attention_heads",
        "num_hidden_layers": "encoder_num_hidden_layers",
    }

    def __init__(
        self,
        vocab_size=32768,
        hidden_size=288,
        intermediate_size=1152,
        encoder_num_hidden_layers=6,
        decoder_num_hidden_layers=6,
        encoder_num_attention_heads=8,
        decoder_num_attention_heads=8,
        encoder_num_key_value_heads=None,
        decoder_num_key_value_heads=None,
        pad_head_dim_to_multiple_of=None,
        encoder_hidden_act="gelu",
        decoder_hidden_act="silu",
        max_position_embeddings=512,
        initializer_range=0.02,
        decoder_start_token_id=1,
        use_cache=True,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.9,
        is_encoder_decoder=True,
        attention_bias=False,
        attention_dropout=0.0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.encoder_num_hidden_layers = encoder_num_hidden_layers
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.encoder_num_attention_heads = encoder_num_attention_heads
        self.decoder_num_attention_heads = decoder_num_attention_heads

        if encoder_num_key_value_heads is None:
            encoder_num_key_value_heads = encoder_num_attention_heads
        self.encoder_num_key_value_heads = encoder_num_key_value_heads

        if decoder_num_key_value_heads is None:
            decoder_num_key_value_heads = decoder_num_attention_heads
        self.decoder_num_key_value_heads = decoder_num_key_value_heads

        self.pad_head_dim_to_multiple_of = pad_head_dim_to_multiple_of

        self.encoder_hidden_act = encoder_hidden_act
        self.decoder_hidden_act = decoder_hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.decoder_start_token_id = decoder_start_token_id
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.is_encoder_decoder = is_encoder_decoder
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Validate the correctness of the rotary position embedding parameters
        rope_config_validation(self)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )


class MoonshineEncoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineDecoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size * 2)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states, gate = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation_fn(gate) * hidden_states
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineAttention(GlmAttention):
    def __init__(
        self,
        config: MoonshineConfig,
        layer_idx: int,
        is_causal: bool,
        num_attention_heads: int,
        num_key_value_heads: int,
    ):
        config.update({"num_attention_heads": num_attention_heads, "num_key_value_heads": num_key_value_heads})
        super().__init__(config, layer_idx)
        self.is_causal = is_causal
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

        # Pad the head dimension to the next multiple of `pad_head_dim_to_multiple_of` if requested
        if self.config.pad_head_dim_to_multiple_of is not None:
            target_multiple = self.config.pad_head_dim_to_multiple_of
            target_head_dim = target_multiple * ((self.head_dim + target_multiple - 1) // target_multiple)
            self.head_dim_padding = target_head_dim - self.head_dim
        else:
            self.head_dim_padding = 0

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        key_value_states: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len = hidden_states.shape[:-1]

        query_states = (
            self.q_proj(hidden_states)
            .view(bsz, q_len, self.config.num_attention_heads, self.head_dim)
            .transpose(1, 2)
        )

        is_cross_attention = key_value_states is not None
        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                past_key_value.is_updated[self.layer_idx] = True
                past_key_value = past_key_value.cross_attention_cache
            else:
                past_key_value = past_key_value.self_attention_cache

        # use key_value_states if cross attention
        current_states = key_value_states if key_value_states is not None else hidden_states
        if is_cross_attention and past_key_value and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = past_key_value.key_cache[self.layer_idx]
            value_states = past_key_value.value_cache[self.layer_idx]
        else:
            key_states = (
                self.k_proj(current_states)
                .view(bsz, -1, self.config.num_key_value_heads, self.head_dim)
                .transpose(1, 2)
            )
            value_states = (
                self.v_proj(current_states)
                .view(bsz, -1, self.config.num_key_value_heads, self.head_dim)
                .transpose(1, 2)
            )
            if is_cross_attention and past_key_value is not None:
                key_states, value_states = past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )

        if not is_cross_attention:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

            if past_key_value is not None:
                # sin and cos are specific to RoPE models; cache_position needed for the static cache
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
                key_states, value_states = past_key_value.update(
                    key_states, value_states, self.layer_idx, cache_kwargs
                )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False

        if self.head_dim_padding > 0:
            query_states = torch.nn.functional.pad(query_states, (0, self.head_dim_padding))
            key_states = torch.nn.functional.pad(key_states, (0, self.head_dim_padding))
            value_states = torch.nn.functional.pad(value_states, (0, self.head_dim_padding))

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            is_causal=is_causal,
            **kwargs,
        )

        if self.head_dim_padding > 0:
            attn_output = attn_output[..., : -self.head_dim_padding]

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MoonshineRotaryEmbedding(GlmRotaryEmbedding):
    pass


class MoonshineEncoderLayer(LlamaDecoderLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int):
        super().__init__(config, layer_idx)

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.encoder_num_attention_heads,
            num_key_value_heads=config.encoder_num_key_value_heads,
        )

        self.mlp = MoonshineEncoderMLP(config, config.encoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)


class MoonshineDecoderLayer(nn.Module):
    def __init__(self, config: MoonshineConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=True,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )

        self.encoder_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )

        self.mlp = MoonshineDecoderMLP(config, config.decoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.final_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        encoder_position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        encoder_position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.post_attention_layernorm(hidden_states)
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )
            hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


MOONSHINE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MoonshineConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


@add_start_docstrings(
    "The bare Moonshine Model outputting raw hidden-states without any specific head on top.",
    MOONSHINE_START_DOCSTRING,
)
class MoonshinePreTrainedModel(PreTrainedModel):
    config_class = MoonshineConfig
    base_model_prefix = "model"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_static_cache = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """
        output_conv1_length = int((input_lengths - 127) / 64 + 1)
        output_conv2_length = int((output_conv1_length - 7) / 3 + 1)
        output_conv3_length = int((output_conv2_length - 3) / 2 + 1)

        return output_conv3_length


class MoonshineEncoder(MoonshinePreTrainedModel):
    """
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    """

    main_input_name = "input_values"

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.conv1 = nn.Conv1d(1, embed_dim, kernel_size=127, stride=64, bias=False)
        self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3)
        self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2)
        self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.layers = nn.ModuleList(
            [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(embed_dim, bias=False)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @can_return_tuple
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        r"""
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_values is None:
            raise ValueError("You must specify input_values.")

        # downsample the raw waveform with the convolutional stem
        input_values = input_values.unsqueeze(1)
        hidden_states = nn.functional.tanh(self.conv1(input_values))
        hidden_states = self.groupnorm(hidden_states)
        hidden_states = nn.functional.gelu(self.conv2(hidden_states))
        hidden_states = nn.functional.gelu(self.conv3(hidden_states))
        hidden_states = hidden_states.permute(0, 2, 1)

        # downsample the attention mask to match the hidden states length
        if attention_mask is not None:
            mask_len = self._get_feat_extract_output_lengths(attention_mask.shape[-1])
            downsample_stride = 64 * 3 * 2  # conv strides
            attention_mask = attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if (attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa" and not output_attentions:
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, hidden_states.dtype)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                    output_attentions,
                    False,
                    None,
                    position_embeddings,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    output_attentions=output_attentions,
                    position_embeddings=position_embeddings,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class MoonshineDecoder(LlamaModel):
    main_input_name = "input_ids"

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.norm = nn.LayerNorm(config.hidden_size, bias=False)
        self.layers = nn.ModuleList(
            [MoonshineDecoderLayer(config, idx) for idx in range(config.decoder_num_hidden_layers)]
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            self_attention_cache = DynamicCache()
            cross_attention_cache = DynamicCache()
            past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None

        # downsample the encoder attention mask to match the downsampled audio length
        if encoder_attention_mask is not None:
            mask_len = encoder_hidden_states.shape[-2]
            downsample_stride = 64 * 3 * 2  # conv strides
            encoder_attention_mask = encoder_attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = (
                    encoder_attention_mask if (encoder_attention_mask == 0.0).any() else None
                )
            elif self.config._attn_implementation == "sdpa" and not output_attentions:
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[-2]
                )
            else:
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[-2]
                )

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    causal_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    position_ids,
                    None,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


MOONSHINE_MODEL_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.

            Two formats are allowed:
            - a [`~cache_utils.Cache`] instance, see our
            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
            cache format.

            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
            legacy cache format will be returned.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids`
            of shape `(batch_size, sequence_length)`.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `decoder_position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
"""


@add_start_docstrings(
    "The bare Moonshine Model outputting raw hidden-states without any specific head on top.",
    MOONSHINE_START_DOCSTRING,
)
class MoonshineModel(WhisperModel):
    @can_return_tuple
    @add_start_docstrings_to_model_forward(MOONSHINE_MODEL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Seq2SeqModelOutput:
        r"""
        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        # wrap tuple-style encoder outputs in a BaseModelOutput
        elif not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # decoder outputs consist of (dec_features, past_key_values, dec_hidden, dec_attn, cross_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    "The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.",
    MOONSHINE_START_DOCSTRING,
)
class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["proj_out.weight"]

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.model = MoonshineModel(config)
        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    @add_start_docstrings_to_model_forward(MOONSHINE_MODEL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Seq2SeqLMOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```"""
        if labels is not None:
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.model(
            input_values,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
        )
        logits = self.proj_out(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = ["MoonshineConfig", "MoonshineModel", "MoonshinePreTrainedModel", "MoonshineForConditionalGeneration"]