
    %	&h@                     2   d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ d	d
lmZmZmZmZmZmZmZ ddlmZ dZ dZ!dZ"g dZ#dZ$dZ%dZ&dZ'dZ( G d dejR                        Z* G d de      Z+ G d de      Z, G d dejR                        Z- G d de      Z. G d d e      Z/ G d! d"e      Z0d#Z1d$Z2 ed%e1       G d& d'ee0             Z3 ed(e1       G d) d*e             Z4 ed+e1       G d, d-e             Z5g d.Z6y)/    )OptionalTupleUnionN   )ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutputCausalLMOutputSequenceClassifierOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigr   zfacebook/hubert-large-ls960-ft)r   i$  i   z['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'gGz6@zsuperb/hubert-base-superb-ksz'_unknown_'g(\!@c                   $     e Zd Z fdZd Z xZS )HubertPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        d | _        |j                  r&t        j                  |j                        | _        nt        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j"                  j%                  | j                  j&                  d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j&                  j(                  }| j                  j                  j&                  j*                  }n,| j                  j,                  }| j                  j.                  }|j"                  j1                  | |       |j"                  j1                  | |       n || j                  dd      | _        t3        |j
                        | _        t6        |j8                     | _        y # 1 sw Y   'xY w)	Nr   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr!   hasattrr'   r   	deepspeedzeroGatheredParametersr$   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr!   r5   r:   r;   	__class__s         /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/hubert/modular_hubert.pyr)   z&HubertPositionalConvEmbedding.__init__0   s   II6622a777
	 %% nnV-?-?@DO((..Krxx00-@ hh77CC)+ ^^66tyy7G7GWX6Y M +DIIH! LDIM499&89#yy99@@JJH#yy99@@JJH#yy11H#yy11H::4J::4J'		aH	)&*H*HI !?!?@M Ms   ?I??J	c                     |j                  dd      }| j                  | j                  |      }| j                  |      }| j                  |      }| j	                  |      }|j                  dd      }|S )Nr   r   )	transposer0   r/   r   r?   r@   hidden_statess     rC   forwardz%HubertPositionalConvEmbedding.forwardU   sn    %//15??& OOM:M		-0]36%//15    __name__
__module____qualname__r)   rH   __classcell__rB   s   @rC   r   r   /   s    #AJ	rI   r   c                       e Zd Zy)r=   NrK   rL   rM    rI   rC   r=   r=   a       rI   r=   c                       e Zd Zy)HubertFeatureEncoderNrQ   rR   rI   rC   rU   rU   e   rS   rI   rU   c                   $     e Zd Z fdZd Z xZS )HubertFeatureProjectionc                 n   t         |           |j                  | _        | j                  r3t        j                  |j
                  d   |j                        | _        t        j                  |j
                  d   |j                        | _
        t        j                  |j                        | _        y )N)eps)r(   r)   feat_proj_layer_normr*   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr,   
projectionDropoutfeat_proj_dropoutdropoutr@   rA   rB   s     rC   r)   z HubertFeatureProjection.__init__j   s}    $*$?$?!$$ ll6??2+>FDYDYZDO))FOOB$79K9KLzz&":":;rI   c                     | j                   r| j                  |      }| j                  |      }| j                  |      }|S )N)r[   r_   ra   rd   rF   s     rC   rH   zHubertFeatureProjection.forwardr   s;    $$ OOM:M6]3rI   rJ   rO   s   @rC   rW   rW   i   s    <rI   rW   c                       e Zd Zy)HubertEncoderNrQ   rR   rI   rC   rh   rh   {   rS   rI   rh   c                       e Zd Zy)HubertEncoderStableLayerNormNrQ   rR   rI   rC   rj   rj      rS   rI   rj   c                   |    e Zd ZdZeZdZdZdZdZ	dZ
d Zdeej                  ef   fdZded	ej                  fd
Zy)HubertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = HubertConfig
    base_model_prefix = "hubert"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            if is_deepspeed_zero3_enabled():
                import deepspeed

                if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
                    with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
                        nn.init.kaiming_normal_(module.weight.data)
                else:
                    with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
                        nn.init.kaiming_normal_(module.weight.data)
            else:
                nn.init.kaiming_normal_(module.weight.data)

            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, HubertModel):
            if hasattr(module, "masked_spec_embed"):
                module.masked_spec_embed.data.uniform_()
        elif isinstance(module, HubertForSequenceClassification):
            if hasattr(module, "layer_weights"):
                module.layer_weights.data.fill_(1.0 / (self.config.num_hidden_layers + 1))

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output length indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask


HUBERT_START_DOCSTRING = r"""
    Hubert was proposed in [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden
    Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,
    Ruslan Salakhutdinov, Abdelrahman Mohamed.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`HubertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

HUBERT_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
            True`. For all models whose processor has `config.return_attention_mask == False`, such as
            [hubert-base](https://huggingface.co/facebook/hubert-base-ls960), `attention_mask` should **not** be passed
            to avoid degraded performance when doing batched inference. For such models `input_values` should simply be
            padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly different
            results depending on whether `input_values` is padded or not.

            </Tip>

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
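
    A minimal input-preparation sketch (illustrative; `raw_speech` stands for one or more 1-D float arrays sampled
    at 16 kHz and is not defined here):

    ```python
    >>> from transformers import AutoProcessor

    >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
    >>> inputs = processor(raw_speech, sampling_rate=16000, padding=True, return_tensors="pt")
    >>> # pass inputs["attention_mask"] to the model only if the processor returns one (see the tip above)
    ```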
"""


@add_start_docstrings(
    "The bare Hubert Model transformer outputting raw hidden-states without any specific head on top.",
    HUBERT_START_DOCSTRING,
)
class HubertModel(Wav2Vec2Model, HubertPreTrainedModel):
    def __init__(self, config: HubertConfig):
        super().__init__(config)
        self.config = config
        self.feature_extractor = HubertFeatureEncoder(config)
        self.feature_projection = HubertFeatureProjection(config)

        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        if config.do_stable_layer_norm:
            self.encoder = HubertEncoderStableLayerNorm(config)
        else:
            self.encoder = HubertEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

        del self.adapter

    def freeze_feature_extractor(self):
        raise AttributeError("Not needed for Hubert")

    def freeze_feature_encoder(self):
        raise AttributeError("Not needed for Hubert")

    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """

        Returns:

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        extract_features = self.feature_extractor(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        hidden_states = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices)

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    """Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    HUBERT_START_DOCSTRING,
)
class HubertForCTC(Wav2Vec2ForCTC):
    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_CTC_EXPECTED_OUTPUT,
        expected_loss=_CTC_EXPECTED_LOSS,
    )
    def forward(self, **super_kwargs):
        super().forward(**super_kwargs)


@add_start_docstrings(
    """
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    """,
    HUBERT_START_DOCSTRING,
)
class HubertForSequenceClassification(Wav2Vec2ForSequenceClassification):
    @add_start_docstrings_to_model_forward(HUBERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_SEQ_CLASS_CHECKPOINT,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
    )
    def forward(self, **super_kwargs):
        super().forward(**super_kwargs)


__all__ = ["HubertForCTC", "HubertForSequenceClassification", "HubertModel", "HubertPreTrainedModel"]
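
# A minimal usage sketch (not part of the module API, kept as a comment so nothing runs at import time):
# CTC transcription with `HubertForCTC`. `raw_speech` is a placeholder for a 16 kHz mono waveform loaded
# as a 1-D float array; the checkpoint matches `_CHECKPOINT_FOR_DOC` above.
#
#     from transformers import AutoProcessor, HubertForCTC
#     import torch
#
#     processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
#     model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
#     inputs = processor(raw_speech, sampling_rate=16000, return_tensors="pt")
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     predicted_ids = torch.argmax(logits, dim=-1)
#     transcription = processor.batch_decode(predicted_ids)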