
    %	&h_                        d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZmZmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZmZmZmZmZmZm Z m!Z! d
dl"m#Z#  ejH                  e%      Z&dZ'dZ(dZ)g dZ*dZ+dZ,e G d de             Z- G d de!      Z. G d de      Z/ G d de      Z0 G d de      Z1 G d de      Z2 G d de      Z3 G d d e      Z4d!Z5d"Z6eZ7 ed#e5       G d$ d%e4e              Z8 ed&e5       G d' d(e4             Z9 ed)e5d*       G d+ d,e             Z: ed-e5       G d. d/e             Z;g d0Z<y)1    N)	dataclass)OptionalTupleUnion   )CausalLMOutputModelOutputSequenceClassifierOutputWav2Vec2BaseModelOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigr   z/patrickvonplaten/unispeech-large-1500h-cv-timit)r   i$  i   zW'mister quilter is the apposl of the midle classes and weare glad to welcom his gosepl'gQ+1@c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)	UniSpeechForPreTrainingOutputaL  
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
            paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r    r   torchFloatTensor__annotations__r!   r"   r#   r$   r   r%        /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.pyr   r   1   s    4 )-D(5$$
%,48hu0018>B):): ;B9=8E$5$56=8<M8E%"3"345<59Ju00129r.   r   c                       e Zd Zy) UniSpeechPositionalConvEmbeddingNr&   r'   r(   r-   r.   r/   r1   r1   U       r.   r1   c                       e Zd Zy)UniSpeechFeatureEncoderNr2   r-   r.   r/   r5   r5   Y   r3   r.   r5   c                       e Zd Zy)UniSpeechFeatureProjectionNr2   r-   r.   r/   r7   r7   ]   r3   r.   r7   c                       e Zd Zy)UniSpeechEncoderNr2   r-   r.   r/   r9   r9   a   r3   r.   r9   c                       e Zd Zy)UniSpeechEncoderStableLayerNormNr2   r-   r.   r/   r;   r;   e   r3   r.   r;   c                   "    e Zd Zed        Zd Zy)UniSpeechGumbelVectorQuantizerc           	          | j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   dimgHz>)meanr*   expsumlog)probsmarginal_probs
perplexitys      r/   _compute_perplexityz2UniSpeechGumbelVectorQuantizer._compute_perplexityj   sR    *YY		.599^VZEZ;[*[ac ddeiik
r.   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )NrA   T)tauhardr?   r         ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr*   softmaxrI   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsrD   )selfr$   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distrH   codevector_idxcodevectors_per_groupr_   s              r/   forwardz&UniSpeechGumbelVectorQuantizer.forwardp   s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r.   N)r&   r'   r(   staticmethodrI   ri   r-   r.   r/   r=   r=   i   s     
#'r.   r=   c                   |    e Zd ZdZeZdZdZdZdZ	dZ
d Zdeej                  ef   fdZded	ej                  fd
Zy)UniSpeechPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    	unispeechinput_valuesTc           
      z   t        |t              r|j                  j                  j                  j                  dd       |j                  j                  j                  j                          t        j                  j                  |j                         yt        |t              rt        j                  j                  |j                  j                  ddt        j                  d|j                  j                   d   |j                  j"                  z  z        z         t        j                  j%                  |j                  j                  d       yt        |t&              rt        j                  d|j(                  j*                  z        }t        j                  j                  |j(                  j                  | |       t        j                  j                  |j(                  j                  | |       yt        |t        j,                        rm|j                  j                  j                  d| j.                  j0                         |j                  %|j                  j                  j                          yyt        |t        j2                  t        j4                  f      rJ|j                  j                  j                          |j                  j                  j7                  d       yt        |t        j8                        rt        j                  j;                  |j                         |j                  jt        j                  |j<                  |j"                  |j                   d   z  z        }t        j                  j                  |j                  | |       yyy)	zInitialize the weights        r   )rB   stdr   r   )abNrM   )
isinstancer=   rP   weightdatanormal_biaszero_rT   inituniform_r_   r1   convmathsqrtkernel_sizein_channels	constant_r7   
projectionin_featuresLinearconfiginitializer_range	LayerNorm	GroupNormfill_Conv1dkaiming_normal_groups)ra   moduleks      r/   _init_weightsz&UniSpeechPreTrainedModel._init_weights   s    f<=%%**222C##((..0GGV//0 @AGGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 :;		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r.   input_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r*   div)input_lengthr   strides      r/   _conv_out_lengthzSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s"     99\K7wWZ[[[r.   )zipr   conv_kernelconv_stride)ra   r   r   r   r   s        r/    _get_feat_extract_output_lengthsz9UniSpeechPreTrainedModel._get_feat_extract_output_lengths   sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r.   feature_vector_lengthattention_maskc                    |j                  d      d d df   }| j                  |      j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )NrA   r?   r   )dtypedevicer   )r   )cumsumr   tor*   longrO   zerosr   r   arangeflipbool)ra   r   r   non_padded_lengthsoutput_lengthsrb   s         r/   "_get_feature_vector_attention_maskz;UniSpeechPreTrainedModel._get_feature_vector_attention_mask   s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr.   N)r&   r'   r(   r)   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpar   r   r*   
LongTensorintr   r   r-   r.   r/   rl   rl      sg    
 #L#$O&*#!N9BeEDTDTVYDY>Z  ]b]m]m r.   rl   a  
    UniSpeech was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled
    Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei,
    Michael Zeng, Xuedong Huang.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`UniSpeechConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aI  
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
            True`. For all models whose processor has `config.return_attention_mask == False`, `attention_mask` should
            **not** be passed to avoid degraded performance when doing batched inference. For such models
            `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these
            models also yield slightly different results depending on whether `input_values` is padded or not.

            </Tip>

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zcThe bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZdefdZd Zd Z ee       e	e
eede      	 	 	 	 	 ddeej                      d	eej                      d
eej"                     dee   dee   dee   deeef   fd              Zy)UniSpeechModelr   c                    t         j                  |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          y )Nrp   )rl   __init__r   r5   feature_extractorr7   feature_projectionmask_time_probmask_feature_probrT   	Parameterr*   Tensorrd   r{   masked_spec_embeddo_stable_layer_normr;   encoderr9   	post_init)ra   r   s     r/   r   zUniSpeechModel.__init__"  s     ))&1!8!@"<V"D  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&:6BDL+F3DL 	r.   c                     t        d      NzNot needed for UniSpeechAttributeErrorra   s    r/   freeze_feature_extractorz'UniSpeechModel.freeze_feature_extractor3      788r.   c                     t        d      r   r   r   s    r/   freeze_feature_encoderz%UniSpeechModel.freeze_feature_encoder6  r   r.   audio)
checkpointoutput_typer   modalityexpected_outputNrn   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      \  }}| j                  |||      }| j                  |||||      }	|	d   }|s
||f|	dd  z   S t        |||	j                  |	j                        S )Nr   r   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr$   r%   )r   r   r   use_return_dictr   	transposer   rO   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr$   r%   )
ra   rn   r   r   r   r   r   r   r$   encoder_outputss
             r/   ri   zUniSpeechModel.forward9  s@   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN*.*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*!#34qr7JJJ'+-)77&11	
 	
r.   )NNNNN)r&   r'   r(   r   r   r   r   r   UNISPEECH_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r*   r   r+   r   r   r   ri   r-   r.   r/   r   r     s    
 "99 ++EF&,$. 269=,0/3&*-
u||,-
 !.-
 $E$5$56	-

 $D>-
 'tn-
 d^-
 
u..	/-
 G-
r.   r   zPUniSpeech Model with a vector-quantization module and ctc loss for pre-training.c                   H    e Zd Zdef fdZdefdZd Zd Ze		 dde
j                  de
j                  d	e
j                  defd
       Z ee       eee      	 	 	 	 ddee
j&                     dee
j&                     dee   dee   dee   deeef   fd              Z xZS )UniSpeechForPreTrainingr   c                 .   t         |   |       t        |      | _        t	        j
                  |j                        | _        t        |      | _	        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                         | _        t	        j
                  |j$                        | _        | j)                          y )N)superr   r   rm   rT   Dropoutfeat_quantizer_dropoutdropout_featuresr=   	quantizerr   codevector_dimproj_codevector_dim	project_qrd   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   )ra   r   	__class__s     r/   r   z UniSpeechForPreTraining.__init__u  s     '/ "

6+H+H I7?6#8#8&:T:TU99V%?%?ASAST		&"4"4f6L6LMzz&"6"67 	r.   rX   c                 &    || j                   _        y)zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rX   )ra   rX   s     r/   set_gumbel_temperaturez.UniSpeechForPreTraining.set_gumbel_temperature  s     &1"r.   c                 X    t        j                  dt               | j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)warningswarnFutureWarningr   r   s    r/   r   z0UniSpeechForPreTraining.freeze_feature_extractor  s'    
 	Q	

 	##%r.   c                 L    | j                   j                  j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rm   r   _freeze_parametersr   s    r/   r   z.UniSpeechForPreTraining.freeze_feature_encoder  s    
 	((;;=r.   target_featuresnegative_featurespredicted_featuresc                     t        j                  | |gd      } t        j                  |j                         | j                         d      }|j	                  |       }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r?   rA   )r*   catcosine_similarityrW   rY   )r   r   r   rX   logitss        r/   compute_contrastive_logitsz2UniSpeechForPreTraining.compute_contrastive_logits  sa      ))_6G$HaP(();)A)A)C_EZEZE\bde0 +%r.   )r   r   rn   r   r   r   r   r   c                    ||n| j                   j                  }| j                  |||||      }|d   }| j                  |d         }| j	                  |      \  }	}
| j                  |	j                  | j
                  j                  j                              }	| j                  |	      }	t        j                  |j                  d      |j                  d            j                  | j                   j                        }|j                  dd      }t        j                   |      j#                         j                  |j$                        }|j                  dd      }|j'                  d      }|j)                  |d      |	j)                  | d      z   }| j+                  |      }| j-                  |      }d}|s||||	|
f|dd z   S ||	|
f|dd z   S t/        |||	|
|j0                  |j2                        S )	a  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
            Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
            Required input for pre-training.

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr   r   r   rA   rp   r   )r    r!   r"   r#   r$   r%   )r   r   rm   r   r   r   r   ru   r   r   r*   emptysizer   replace_probr   	bernoullir   r   r^   masked_fillr   r   r   r$   r%   )ra   rn   r   r   r   r   outputstransformer_featuresr   quantized_featuresr#   prob_replace_matrixsampled_replace_matrixr  r    s                  r/   ri   zUniSpeechForPreTraining.forward  s   > &1%<k$++B]B]..)/!5# ! 
  'qz  00<48NNCS4T11 "^^,>,A,A$..BWBWB]B],^_!--.@A#kk*>*C*CA*FH\HaHabcHdekkKK$$
 2;;AqA!&1D!E!J!J!L!O!OPdPkPk!l!7!A!A!Q!G!7!A!A"!E%112H#N**,B+BCH

 f%v& 24FH]^ahijikalll(*<>STW^_`_aWbbb,1'9"7!//))
 	
r.   )r   )NNNN)r&   r'   r(   r   r   r   r   r   r   rj   r*   r+   r  r   r   r   r   r   r   r   r   r   r   ri   __classcell__r   s   @r/   r   r   q  s    1# 1
&> 
 	** ,, "-- 	 & ++EF+HWfg 26,0/3&*M
u||,M
 !.M
 $D>	M

 'tnM
 d^M
 
u33	4M
 h GM
r.   r   zgUniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).a/  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng'
            by default.
    c                   T     e Zd Z ee       eeeee	e
       fd              Z xZS )UniSpeechForCTC)r   r   r   r   expected_lossc                 $    t        |   di | y Nr-   r   ri   ra   super_kwargsr   s     r/   ri   zUniSpeechForCTC.forward  s     	','r.   )r&   r'   r(   r   r   r   r   r   r   _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSri   r  r  s   @r/   r  r    s:     ++EF&"$,(( G(r.   r  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                   R     e Zd Z ee       eeeed       fd              Z	 xZ
S )"UniSpeechForSequenceClassificationr   )r   r   r   r   c                 $    t        |   di | y r  r  r  s     r/   ri   z*UniSpeechForSequenceClassification.forward"  s     	','r.   )r&   r'   r(   r   r   r   r   r
   r   ri   r  r  s   @r/   r  r    s7     ++EF&,$	( G(r.   r  )r  r   r  r   rl   )=r}   r   dataclassesr   typingr   r   r   r*   torch.nnrT   modeling_outputsr   r	   r
   r   modeling_utilsr   utilsr   r   r   r   r   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr&   logger_HIDDEN_STATES_START_POSITIONr   r   r   r  r  r   r1   r5   r7   r9   r;   r=   rl   UNISPEECH_START_DOCSTRINGr   r   r   r   r  r  __all__r-   r.   r/   <module>r*     s     ! ) )   n n - 
 
 
 5 
		H	% !"  $ H '  q    :K  :  :F	'F 		4 		!: 		 		&D 	*'%B *'ZJ JZ $" J 3  iM
-} M
	M
` Z\uL
6 L
L
^ q	
(n 
(	
(  	()J 	(	(r.   