
    %	&h
                       d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mc mZ d dlZ	d dl	mZ d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+  e%       rd dl,m-Z- d dl.m/Z/ d dl0m1Z1 ne2Z/dZ3dZ4 e&jj                  e6      Z7 G d de      Z8	 	 dTde	jr                  de	jr                  dee	jr                     dee	jr                     dee	jr                  e	jr                  e	jr                  e:ee	jr                     ee	jr                     f   f
dZ;de	jr                  de	jr                  de:de:de	jr                  f
dZ< G d  d!e	jz                  j|                        Z?	 	 dTd"ee	jr                     d#ee:   fd$Z@ G d% d&e/      ZA G d' d(ej                        ZC G d) d*ej                        ZD G d+ d,e*      ZE	 dUd-d.d/e	jr                  de	jr                  d0e	jr                  dee	j                     d1ee:e:f   d2e:d3e:d4eeG   deee	jr                  e	jr                  f   ee	jr                     f   fd5ZHe	j                  fd-d.d/e	jr                  d6eAd"e	jr                  d#e:d1ee:e:f   d2e:d3e:d7e	j                  dee	jr                     fd8ZKd-d.d/e	jr                  de	jr                  d0e	jr                  dee	j                     d1ee:e:f   d2e:d3e:dee	jr                     fd9ZLeKeHeLd:ZM G d; d.ej                        ZN G d< d=ej                        ZOd>ZP e#d?eP       G d@ dAe              ZQdBZR e#d?eP       G dC dDeQ             ZS G dE dFej                        ZT e#dGeP       G dH dIeQ             ZU e#dJeP       G dK dLeQ             ZV e#dMeP       G dN dOeQ             ZW e#dPeP       G dQ dReQ             ZXg dSZYy)V    N)nullcontext)DictLiteralOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)PretrainedConfig)_prepare_4d_attention_mask)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_flash_attn_2_availablelogging)is_triton_available   )GemmaRotaryEmbeddingapply_rotary_pos_emb) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryzanswerdotai/ModernBERT-baseModernBertConfigc                        e Zd ZdZdZdgZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dded   f fdZ fdZ xZ	S )	r#   a  
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate a ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ModernBERT-base.
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`ModernBertModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        global_rope_theta (`float`, *optional*, defaults to 160000.0):
            The base period of the global RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        global_attn_every_n_layers (`int`, *optional*, defaults to 3):
            The number of layers between global attention layers.
        local_attention (`int`, *optional*, defaults to 128):
            The window size for local attention.
        local_rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the local RoPE embeddings.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
            CLS token doesn't attend to all tokens on long sequences.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios.
        repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
            When True, ModernBertForMaskedLM keeps track of the logits' gradient when repadding for output. This only
            applies when using Flash Attention 2 with passed labels. Otherwise output logits always have a gradient.

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "modernbert"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50368,
        hidden_size=768,
        intermediate_size=1152,
        num_hidden_layers=22,
        num_attention_heads=12,
        hidden_activation="gelu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        initializer_cutoff_factor=2.0,
        norm_eps=1e-5,
        norm_bias=False,
        pad_token_id=50283,
        eos_token_id=50282,
        bos_token_id=50281,
        cls_token_id=50281,
        sep_token_id=50282,
        global_rope_theta=160000.0,
        attention_bias=False,
        attention_dropout=0.0,
        global_attn_every_n_layers=3,
        local_attention=128,
        local_rope_theta=10000.0,
        embedding_dropout=0.0,
        mlp_bias=False,
        mlp_dropout=0.0,
        decoder_bias=True,
        classifier_pooling: Literal["cls", "mean"] = "cls",
        classifier_dropout=0.0,
        classifier_bias=False,
        classifier_activation="gelu",
        deterministic_flash_attn=False,
        sparse_prediction=False,
        sparse_pred_ignore_index=-100,
        reference_compile=None,
        repad_logits_with_grad=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.initializer_cutoff_factor = initializer_cutoff_factor
        self.norm_eps = norm_eps
        self.norm_bias = norm_bias
        self.global_rope_theta = global_rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.local_attention = local_attention
        self.local_rope_theta = local_rope_theta
        self.embedding_dropout = embedding_dropout
        self.mlp_bias = mlp_bias
        self.mlp_dropout = mlp_dropout
        self.decoder_bias = decoder_bias
        self.classifier_pooling = classifier_pooling
        self.classifier_dropout = classifier_dropout
        self.classifier_bias = classifier_bias
        self.classifier_activation = classifier_activation
        self.deterministic_flash_attn = deterministic_flash_attn
        self.sparse_prediction = sparse_prediction
        self.sparse_pred_ignore_index = sparse_pred_ignore_index
        self.reference_compile = reference_compile
        self.repad_logits_with_grad = repad_logits_with_grad

        if self.classifier_pooling not in ["cls", "mean"]:
            raise ValueError(
                f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.'
            )

    def to_dict(self):
        output = super().to_dict()
        output.pop("reference_compile", None)
        return output


def _unpad_modernbert_input(
    inputs: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max().item())
    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

    if inputs.dim() == 2:
        unpadded_inputs = inputs.flatten()[indices]
    else:
        batch, seqlen, *rest = inputs.shape
        shape = batch * seqlen
        unpadded_inputs = inputs.view(shape, *rest)[indices]

    unpadded_position_ids = position_ids.flatten()[indices] if position_ids is not None else None
    unpadded_labels = labels.flatten()[indices] if labels is not None else None

    return unpadded_inputs, indices, cu_seqlens, max_seqlen_in_batch, unpadded_position_ids, unpadded_labels


def _pad_modernbert_output(
    inputs: torch.Tensor,
    indices: torch.Tensor,
    batch: int,
    seqlen: int,
) -> torch.Tensor:
    """
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    """
    if inputs.dim() == 1:
        output = torch.zeros(batch * seqlen, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen)
    else:
        _, *rest = inputs.shape
        output = torch.zeros(batch * seqlen, *rest, dtype=inputs.dtype, device=inputs.device)
        output[indices] = inputs
        padded_inputs = output.view(batch, seqlen, *rest)

    return padded_inputs


class ApplyRotaryEmbUnpad(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        qkv,
        cos,
        sin,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
    ):
        # (total_nnz, 3, nheads, headdim)
        qkv = qkv.contiguous()
        total_nnz, _three, _nheads, headdim = qkv.shape
        # qkv must be contiguous so that reshaping to combine the (3, nheads)
        # dimensions yields a view of the same tensor.
        qk = qkv[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            qk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            interleaved=False,
            inplace=True,
        )

        ctx.save_for_backward(cos, sin, cu_seqlens)
        ctx.max_seqlen = max_seqlen
        return qkv

    @staticmethod
    def backward(ctx, do):
        cos, sin, cu_seqlens = ctx.saved_tensors
        do = do.contiguous()
        total_nnz, _three, _nheads, headdim = do.shape
        # do must be contiguous so that reshaping to combine the (3, nheads)
        # dimensions yields a view of the same tensor.
        dqk = do[:, :2].view(total_nnz, -1, headdim)
        apply_rotary(
            dqk,
            cos,
            sin,
            seqlen_offsets=0,
            cu_seqlens=cu_seqlens,
            max_seqlen=ctx.max_seqlen,
            interleaved=False,
            inplace=True,
            conjugate=True,
        )

        return do, None, None, None, None


def apply_rotary_unpadded(
    qkv,
    cos,
    sin,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    """
    return ApplyRotaryEmbUnpad.apply(qkv, cos, sin, cu_seqlens, max_seqlen)


class ModernBertUnpaddedRotaryEmbedding(RotaryEmbedding):
    """
    The rotary position embeddings applied directly to unpadded sequences.
    """

    def __init__(
        self,
        dim: int,
        base: float = 10000.0,
        max_seqlen: Optional[int] = None,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        """
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache will be recomputed during the forward pass.
        """
        super().__init__(dim=dim, base=base, pos_idx_in_fp32=True, device=device, interleaved=False)
        self.max_seqlen = max_seqlen

        if max_seqlen is not None and device is not None and dtype is not None:
            self._update_cos_sin_cache(max_seqlen, device=device, dtype=dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        cu_seqlens: torch.Tensor,
        max_seqlen: Optional[int] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        """
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)

        qkv = apply_rotary_unpadded(
            qkv,
            self._cos_cached,
            self._sin_cached,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        return qkv

    def extra_repr(self) -> str:
        return f"dim={self.dim}, base={self.base}, scale_base={self.scale_base}"


class ModernBertEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias)
        self.drop = nn.Dropout(config.embedding_dropout)

    @torch.compile(dynamic=True)
    def compiled_embeddings(self, input_ids: torch.LongTensor) -> torch.Tensor:
        return self.drop(self.norm(self.tok_embeddings(input_ids)))

    def forward(
        self, input_ids: torch.LongTensor = None, inputs_embeds: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        if inputs_embeds is not None:
            hidden_states = self.drop(self.norm(inputs_embeds))
        else:
            hidden_states = (
                self.compiled_embeddings(input_ids)
                if self.config.reference_compile
                else self.drop(self.norm(self.tok_embeddings(input_ids)))
            )
        return hidden_states


class ModernBertMLP(nn.Module):
    """Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    """

    def __init__(self, config: ModernBertConfig):
        super().__init__()
        self.config = config
        self.Wi = nn.Linear(config.hidden_size, int(config.intermediate_size) * 2, bias=config.mlp_bias)
        self.act = ACT2FN[config.hidden_activation]
        self.drop = nn.Dropout(config.mlp_dropout)
        self.Wo = nn.Linear(config.intermediate_size, config.hidden_size, bias=config.mlp_bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        input, gate = self.Wi(hidden_states).chunk(2, dim=-1)
        return self.Wo(self.drop(self.act(input) * gate))
       L     e Zd Zddedededeej                     f fdZ	 xZ
S )ModernBertRotaryEmbeddingr   ru   r   r   c                 d    t         |   | ||       | j                  d |||      \  }| _        y )N)r   r   )ru   r   )r3   r4   rope_init_fnattention_scaling)rS   r   ru   r   r   inv_freqrU   s         rV   r4   z"ModernBertRotaryEmbedding.__init__  s9    fV<+/+<+<T6sY]+<+^($(rW   r   )re   rf   rg   r#   r~   r   r   rz   r   r4   rk   rl   s   @rV   r   r     s=    _/ _c _ _PXY^YeYePf _ _rW   r   moduleModernBertAttentionr   sliding_window_maskrD   bsru   output_attentionsc	                    | j                  ||      \  }
}|j                  dd      j                  d      \  }}}t        |||
|      \  }}| j                  dz  }t        j                  ||j                  dd            |z  }|dk7  r|}||z   }t        j                  j                  |dt
        j                  	      j                  |j                        }t        j                  j                  || j                  | j                  
      }t        j                  ||      }|j                  dd      j!                         }|j#                  |d|      }|r||fS |fS )Nro   r   rx   r   r         ࿩rs   rs   rs   rt   )ptraining)
rotary_emb	transposeunbindr   head_dimrz   matmulr	   r   softmaxfloat32torv   dropoutrA   r   r   r   )r   r   rn   r   ro   rD   r   ru   r   _kwargsr   r   querykeyvaluescaleattn_weightsattn_outputs                     rV   eager_attention_forwardr    sK      < @HCa+22q29E3%eS#s;JE3OOT!E<<s}}Q':;eCL(",.0L ==((2U]](SVVW\WbWbcL==((9Q9Q\b\k\k(lL,,|U3K''1-88:K""2r3/K\**>rW   r   target_dtypec	                     ||||      }|j                   t        j                  t        j                  fv}
|
rb|j                   }|j	                  |      }t        |||| j                  r| j                  nd| j                  |      }|j	                  |      }n3t        |||| j                  r| j                  nd| j                  |      }|j                  ||      fS )Nr   rb   )r   r   	dropout_pdeterministicwindow_size)
rv   rz   float16bfloat16r  r    r   rA   rM   r   )r   r   r   r   r   rD   r   ru   r  r  convert_dtype
orig_dtypeattns                rV   flash_attention_forwardr  ,  s     SZJ
GCIIemmU^^%DDM YY
ff\"/!!28//f..s 99'
 wwz"/!!28//f..s 99'
 IIb#  rW   c                 v   | j                  ||      \  }	}
|j                  dd      j                  d      \  }}}t        |||	|
      \  }}|dk7  r|}t	        j
                  |||| j                  r| j                  nd|      j                  dd      j                         }|j                  |d	|      }|fS )
Nr   r   rx   r   r   r   rb   )r  	attn_maskrs   )
r   r   r  r   Fscaled_dot_product_attentionr   rA   r   r   )r   r   rn   r   ro   rD   r   ru   r  r   r   r	  r
  r  r  s                  rV   sdpa_attention_forwardr  W  s       < @HCa+22q29E3%eS#s;JE3(", 	
&&28//f..s$	
 
1a	  ""2r3/K>rW   )flash_attention_2eagersdpac                   z     e Zd ZdZd	dedee   f fdZ	 d
dej                  dee
   dej                  fdZ xZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
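
    Note (an illustrative gloss, inferred from how `layer_id` is checked against
    `global_attn_every_n_layers` in `__init__`): layers whose index is a multiple of
    `global_attn_every_n_layers` attend globally, while all other layers attend within a
    `local_attention`-sized sliding window. With the default of 3:

    ```python
    >>> [("global" if i % 3 == 0 else "local") for i in range(6)]
    ['global', 'local', 'local', 'global', 'local', 'local']
    ```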
    r   layer_idc                    t         |           || _        || _        |j                  |j
                  z  dk7  r&t        d|j                   d|j
                   d      |j                  | _        |j                  | _        |j
                  | _	        |j                  |j
                  z  | _
        | j                  | j                  z  | _        t        j                  |j                  d| j                  z  |j                        | _        ||j                   z  dk7  r$|j"                  dz  |j"                  dz  f| _        nd| _        |j$                  }|j&                  }| j"                  dk7  r$|j(                  |j(                  }|j"                  }|j*                  d	k(  rt-        | j                  ||
      | _        nt1        || j                  |      | _        t        j                  |j                  |j                  |j                        | _        |j                  dkD  rt        j4                  |j                        nt        j6                         | _        t;               | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   r   r   r   )ru   r   r   )r   ru   r   rb   )r3   r4   r   r$  r7   r:   rR   rA   rM   	num_headsr  all_head_sizer	   r   r@   WqkvrC   rD   r?   r6   rE   _attn_implementationr   r   r   r   r   Identityout_dropsetpruned_heads)rS   r   r$  
rope_thetar6   rU   s        rV   r4   zModernBertAttention.__init__  s      : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%33**f.H.HH!]]T^^;IIf00!d6H6H2HvOdOde	f7771<$*$:$:a$?AWAW[\A\#]D #+D --
"("@"@8+&&2#44
&,&<&<#&&*==?MM.EJDO 8v4==_ijDO))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqErW   r   r   rq   c           
         | j                  |      }|j                  d   }| j                  j                  dk(  r)|j	                  dd| j
                  | j                        }n)|j	                  |dd| j
                  | j                        }t        | j                  j                     | f|| j                  | j                  || j                  |d|}|d   }| j                  | j                  |            }|f|dd  z   S )Nr   r   rs   r   )r   r   rD   r   ru   r   rx   )r)  r   r   r*  r   r'  r  MODERNBERT_ATTENTION_FUNCTIONr   rD   r(  r,  r   )rS   r   r   rT   r   r   attn_outputss          rV   r   zModernBertAttention.forward  s     ii&  #;;++/BB((2q$..$--@C((2r1dnndmmDC4T[[5U5UV	
 00""/	
 	
 %Qdggm&<=,qr"222rW   r   F)re   rf   rg   rh   r#   r   r~   r4   rz   r   boolr   rk   rl   s   @rV   r   r     sS    &"/ &"8C= &"V -23||3 $D>3
 
3rW   c                   f    e Zd Zddedee   f fdZ ej                  d      dej                  dej                  fd       Z
	 	 	 	 	 	 ddej                  d	eej                     d
eej                     deej                     deej                     dee   dee   dej                  fdZ xZS )ModernBertEncoderLayerr   r$  c                    t         |           || _        |dk(  rt        j                         | _        n;t        j                  |j                  |j                  |j                        | _        t        ||      | _        t        j                  |j                  |j                  |j                        | _        t        |      | _        y )Nr   r   )r   r$  )r3   r4   r   r	   r+  	attn_normr   r7   r=   r>   r   r  mlp_normr   mlprS   r   r$  rU   s      rV   r4   zModernBertEncoderLayer.__init__  s    q=[[]DN\\&*<*<&//X^XhXhiDN'vI	V%7%7V__SYScScd (rW   Tr   r   rq   c                 B    | j                  | j                  |            S r   )r:  r9  rS   r   s     rV   compiled_mlpz#ModernBertEncoderLayer.compiled_mlp  s    xxm455rW   rn   r   ro   r   r   r   c           	      
   | j                  | j                  |      ||||||      }||d   z   }| j                  j                  r| j	                  |      n| j                  | j                  |            }	||	z   }|f|dd  z   S )Nrn   r   ro   r   r   r   r   rx   )r  r8  r   rP   r>  r:  r9  )
rS   r   rn   r   ro   r   r   r   r2  
mlp_outputs
             rV   r   zModernBertEncoderLayer.forward  s     yyNN=)) 3%!!/ ! 
 &Q7 {{,, m,$--67 	
 &
2,qr"222rW   r   )NNNNNF)re   rf   rg   r#   r   r~   r4   rz   r   r   r>  r   r4  r   rk   rl   s   @rV   r6  r6    s    	)/ 	)8C= 	) U]]4 6%,, 65<< 6 !6 266:37-1$(,13||3 !.3 &ell3	3
 u//03 U\\*3 SM3 $D>3 
3rW   r6  aO  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`ModernBertConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
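
    Example (a minimal loading sketch; assumes the `answerdotai/ModernBERT-base` checkpoint):

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Random weights from a fresh config, or pretrained weights from the Hub.
    >>> model = ModernBertModel(ModernBertConfig())
    >>> model = ModernBertModel.from_pretrained("answerdotai/ModernBERT-base")
    ```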
zXThe bare ModernBert Model outputting raw hidden-states without any specific head on top.c                        e Zd ZeZdZdZddgZdZdZ	dZ
dej                  fdZe	 	 	 	 dded	eej$                     d
eeeeeef   f      def fd       Zd Z fdZ xZS )ModernBertPreTrainedModelmodelTr   r6  Fr   c                    | j                   j                  ddt        j                  dt        ffd}| j                   j
                  | j                   j
                  t        j                  d| j                   j                  z        z  | j                   j
                  | j                   j                  dz  d}t        |t              r ||j                  |d          y t        |t              r- ||j                  |d	           ||j                  |d
          y t        |t               r- ||j"                  |d	           ||j                  |d
          y t        |t$              r ||j&                  |d
          y t        |t(              r ||j*                  |d
          y t        |t,        t.        t0        f      r ||j2                  |d          y t        |t        j4                        rW|j6                  j8                  j;                  d       |j<                  %|j<                  j8                  j?                          y y y )Nr   r   stdc                    t         j                  j                  | j                  d| |z  |z         t	        | t         j
                        r7| j                  *t         j                  j                  | j                         y y y )Nrb   )r*   rF  ab)r	   inittrunc_normal_weight
isinstancer   r   zeros_)r   rF  cutoff_factors     rV   init_weightz<ModernBertPreTrainedModel._init_weights.<locals>.init_weight"  sq    GG!! .3&#% "  &")),;;*GGNN6;;/ + -rW   r_   r   )inout	embedding	final_outrS  rQ  rR  rT  g      ?) r   r<   r	   Moduler   r;   mathsqrtr9   r7   rM  r   r   r   r   r   r   r)  ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassification ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   rL  datafill_r   zero_)rS   r   rP  stdsrO  s       @rV   _init_weightsz'ModernBertPreTrainedModel._init_weights  s   == M	0		 	0 	0 ++//;;00499S4;;C`C`=`3aa6600$6	
 f23--tK/@A.		4:.		4;/ 34T$Z0		4;/ 89d5k2 56U402RTrs
 ))4+<=-MM$$S){{&  &&( ' .rW   use_flash_attention_2torch_dtype
device_mapcheck_device_mapc                     |j                   ,d|_         	 | j                  |t        j                  |d|      S t        |   ||t        j                  ||      S # t        t
        f$ r
 d |_         Y :w xY w)Nr   F)rf  rg  hard_check_onlyrh  )re  rf  rg  rh  )_attn_implementation_internal_check_and_enable_flash_attn_2rz   r  rR   ImportErrorr3   _autoset_attn_implementation)r)   r   re  rf  rg  rh  rU   s         rV   rn  z6ModernBertPreTrainedModel._autoset_attn_implementationL  s     //73FF0	<99 %)$)%5 :   w3"7!- 4 
 	
 , <7;4<s   #A A54A5c                    | j                   j                  du ry t        | d      rTt        | j                        dkD  r<| j                   j                  rt
        j                  d       d| j                   _        | j                  j                  dk(  r<| j                   j                  rt
        j                  d       d| j                   _        | j                  j                  dk(  r<| j                   j                  rt
        j                  d       d| j                   _        | j                   j                  t               | j                   _        y y )	NFhf_device_maprx   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.mpsz|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.cpuz|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
r   rP   hasattrlenrp  loggerwarning_oncer   typer   r   s    rV   _maybe_set_compilez,ModernBertPreTrainedModel._maybe_set_compilem  s   ;;((E14)c$2D2D.E.I{{,,##9 -2DKK);;u${{,,##9 -2DKK);;u${{,,##9 -2DKK);;((0,?,ADKK) 1rW   c                     t        |   |i |}| j                  j                  dv r<| j                  j                  rt        j                  d       d| j                  _        |S )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)r3   resize_token_embeddingsr   rP   ru  rv  )rS   argsrT   model_embedsrU   s       rV   rz  z1ModernBertPreTrainedModel.resize_token_embeddings  s[    w6GG;;((L8{{,,##y -2DKK)rW   )FNNT)re   rf   rg   r#   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_2_supports_sdpa_supports_flex_attnr	   rU  rd  classmethodr4  r   rz   rv   r   r   r   r~   rn  rx  rz  rk   rl   s   @rV   rC  rC    s    
 $L&*#/1IJ!N-)BII -)^  ',-1;?!%
  $
 ekk*	

 U3S#X#678
 
 
@B>
 
rW   rC  a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. With Flash Attention 2.0, padding will be ignored
            by default should you provide it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
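
    Example (a minimal forward-pass sketch; assumes the `answerdotai/ModernBERT-base` checkpoint):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, ModernBertModel

    >>> tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
    >>> model = ModernBertModel.from_pretrained("answerdotai/ModernBERT-base")

    >>> with torch.no_grad():
    ...     outputs = model(**tokenizer("Paris is the capital of France.", return_tensors="pt"))
    >>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
    ```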
c            "           e Zd Zdef fdZd Zd Z ee       e	e
ee      	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                      deej                      d	eej                     d
eej                      deej                      deej                      dee   dee   dee   dee   dee   dee   deeej                   df   ef   fd              Zdej                   dedej                   fdZ xZS )ModernBertModelr   c           	         t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        j                  |j                  |j                  |j                        | _        d| _        | j#                          y c c}w )Nr   F)r3   r4   r   r   
embeddingsr	   
ModuleListranger9   r6  layersr   r7   r=   r>   
final_normgradient_checkpointing	post_initr;  s      rV   r4   zModernBertModel.__init__  s     .v6mmFKFLdLdFef(#FH5f
 ,,v'9'9vU[UeUef&+#	 gs   C c                 .    | j                   j                  S r   r  r   r   s    rV   get_input_embeddingsz$ModernBertModel.get_input_embeddings  s    ---rW   c                 &    || j                   _        y r   r  )rS   r  s     rV   set_input_embeddingsz$ModernBertModel.set_input_embeddings  s    ).&rW   
checkpointoutput_typer}  r   rn   r   ro   r   r   r   r   
batch_sizeseq_lenr   output_hidden_statesreturn_dictrq   .c                 j  	
 ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      |rdnd }|rdnd }| j                          || j                  ||       	)
'||j                  d d \  	
n|j                  d d \  	
||j                  n|j                  }|(t        j                  	
f|t        j                        }d}| j                   j                  dk(  rM||d}|0t        j                         5  t        ||      ^}}}}d d d        nQt        ||      ^}}}}n>|&t        j                  
|	      j!                  d
      }| j#                  ||      \  }}| j%                  ||      }| j&                  D ]t  }|r||fz   }| j(                  r/| j*                  r#| j-                  |j.                  |||||||      }n ||||||||      }|d
   }|s]t1        |      dkD  sl||d   fz   }v |r||fz   }| j3                  |      }|r't5        |	
      }|t7        	
fd|D              }|st7        d |||fD              S t9        |||      S # 1 sw Y   xY w)Nz:You must specify exactly one of input_ids or inputs_embedsr2   r   r   Fr   T)rm   rn   )r   r   )r   )r   r   r@  rx   rm   r   r   r   c              3   <   K   | ]  }t        |         yw)r  N)r   ).0hsr  r   r  s     rV   	<genexpr>z*ModernBertModel.forward.<locals>.<genexpr>[  s(      * +"gZ`ghh*s   c              3   &   K   | ]	  }||  y wr   r2   )r  vs     rV   r  z*ModernBertModel.forward.<locals>.<genexpr>a  s     mq_`_lms   )last_hidden_stater   
attentions)r   r   r  use_return_dictrR   rx  %warn_if_padding_and_no_attention_maskr   r   rz   onesr4  r*  no_gradr   arange	unsqueeze_update_attention_maskr  r  r  r   _gradient_checkpointing_func__call__rt  r  r   tupler   )rS   r   rn   r   ro   r   r   r   r   r  r  r   r  r  all_hidden_statesall_self_attentionsr   repadr   r   encoder_layerlayer_outputss         `  ``           rV   r   zModernBertModel.forward  sG   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ"6BD$5b4! 66y.Q'/(&3&9&9"1&=#
G&/oobq&9#
G%.%:!!@T@T!"ZZW(=fTYT^T^_N;;++/BB:#5*:L ( I`#,^JF	7J
Q 
 Ja,^JFM7J
Q #$||GFCMMaP262M2M2C 3N 3/N/ )=Y![[ 	PM#$58H$H!**t}} $ A A!**!"' %	! !.!#1(;!-))&7! *!,M S%7!%;&9]1=M<O&O#7	P:   1]4D D62$gZPWM !,$) */* %!
 m]4EGZ$[mmm++*
 	
A s   >J((J2c                     |r| j                   j                  dk(  r't        j                  d       d| j                   _        nF| j                   j                  dk7  r-t        j                  d| j                   j                   d       t	        || j
                        }t        j                  |j                  d         j                  d      }t        j                  ||j                  z
        }|| j                   j                  dz  k  j                  d      j                  d      j                  |j                        }|j                  |j!                         t        j"                  | j
                        j$                        }||fS )Nr"  zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r!  zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r   r   )r   r*  ru  rv  r   rv   rz   r  r   r  absTrD   r  r   masked_filllogical_notfinfomin)rS   rn   r   global_attention_maskrowsdistancewindow_maskr   s           rV   r  z&ModernBertModel._update_attention_maskh  sS   {{//69##V 4;011W<##  $ @ @A B:: !;>4:: V ||177:;EEaH99TDFF]+ 4499DDQGQQRSTWWXfXmXmn 	 4??@W@W@Y[`[f[fgkgqgq[r[v[vw$&999rW   NNNNNNNNNNNNN)re   rf   rg   r#   r4   r  r  r   MODERNBERT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rz   r   r   r~   r4  r   r   r   r  rk   rl   s   @rV   r  r    s   
	/ 	./ ++FG&#$ 15156:3704*.-1$($(!%,0/3&*t
E,,-t
 !.t
 &ell3	t

 u//0t
  -t
 %,,'t
 U\\*t
 SMt
 SMt
 #t
 $D>t
 'tnt
 d^t
 
uU\\3&'8	9t
 Ht
l:U\\ :VZ :_d_k_k :rW   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )rX  r   c                 J   t         |           || _        t        j                  |j
                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        y )Nr   )r3   r4   r   r	   r   r7   rK   rY  r   rL   r   r   r=   r>   r   r   s     rV   r4   z!ModernBertPredictionHead.__init__  sq    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	rW   r   rq   c                 `    | j                  | j                  | j                  |                  S r   )r   r   rY  r=  s     rV   r   z ModernBertPredictionHead.forward  s#    yy$**]";<==rW   )	re   rf   rg   r#   r4   rz   r   r   rk   rl   s   @rV   rX  rX    s-    a/ a>U\\ >ell >rW   rX  zZThe ModernBert Model with a decoder head on top that is used for masked language modeling.c            #       `    e Zd ZdgZdef fdZd Zdej                  fdZ	 e
j                  d      d	e
j                  d
e
j                  fd       Z ee       eeee      	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee
j*                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee   dee   dee   dee   dee   dee   d
eee
j                     ef   fd              Z xZS )rZ  zdecoder.weightr   c                 t   t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                  |j                        | _        | j                  j                  | _        | j                  j                  | _        | j                          y )Nr   )r3   r4   r   r  rD  rX  headr	   r   r7   r5   rI   r[  rN   rO   r  r   s     rV   r4   zModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	rW   c                     | j                   S r   r[  r   s    rV   get_output_embeddingsz+ModernBertForMaskedLM.get_output_embeddings  s    ||rW   new_embeddingsc                     || _         y r   r  )rS   r  s     rV   set_output_embeddingsz+ModernBertForMaskedLM.set_output_embeddings  s	    %rW   Tr   r[   rq   c                 B    | j                  | j                  |            S r   )r[  r  )rS   r[   s     rV   compiled_headz#ModernBertForMaskedLM.compiled_head  s    ||DIIf-..rW   r  r   rn   r   ro   r   rp   r   r   r   r  r  r   r  r  c                 H   ||n| j                   j                  }| j                          | j                   j                  dk(  r|||	|
)|'||j                  d d \  }
}n|j                  d d \  }
}||j
                  n|j
                  }|(t        j                  |
|f|t        j                        }|4t        j                         5  t        ||||      \  }}}}	}}d d d        nt        ||||      \  }}}}	}}| j                  ||||||||	|
||||      }|d   }| j                  rK|I|j                  d      }|j                  |j                  d   d      }|| j                  k7  }||   }||   }| j                   j                  r| j!                  |      n| j#                  | j%                  |            }d }|(| j'                  ||| j                   j(                        }| j                   j                  dk(  rN| j                   j*                  s|
t-               nt        j                         5  t/        |||
|	      }d d d        |s|f}||f|z   S |S t1        |||j2                  |j4                  
      S # 1 sw Y   xY w# 1 sw Y   HxY w)Nr   r   r   )rm   rn   ro   rp   r   rn   r   ro   r   r   r   r   r  r  r   r  r  r   rs   )r5   r  losslogitsr   r  )r   r  rx  r*  r   r   rz   r  r4  r  r   rD  rN   r   rO   rP   r  r[  r  loss_functionr5   rQ   r   r   r   r   r  )rS   r   rn   r   ro   r   rp   r   r   r   r  r  r   r  r  rT   r   outputsr  mask_tokensr  r  r[   s                          rV   r   zModernBertForMaskedLM.forward  s   0 &1%<k$++B]B]!;;++/BB:#5*:L%'/$0.;.A.A"1.E+
G.7oobq.A+
G-6-B))H\H\!)%*ZZW0Ef\a\f\f%gN ( [r#,^Zfou\X	7J
LRX 
 \s,^Zfou\XM7J
LRX **) 3%'!!!/!5#  
 $AJ!!f&8[[_F 1 6 6v||A K !D$A$AAK 1+ >K(F {{,, 01dii(9:; 	 %%ffAWAW%XD;;++/BB"&++"D"D\a\i\i\k r/vwV`ipqr YF)-)9TGf$EvE!//))	
 	
m ^r rs   JJJJ!NNNNNNNNNNNNNN)re   rf   rg   _tied_weights_keysr#   r4   r  r	   r   r  rz   r   r   r  r   r  r   r  r   r  r   r   r~   r4  r   r   r   rk   rl   s   @rV   rZ  rZ    s   
 ++/ &BII & U]]4 /ELL /U\\ / !/ ++FG&"$ 15156:/304)-*.-1$($(!%,0/3&*]
E,,-]
 !.]
 &ell3	]

 u||,]
  -]
 &]
 %,,']
 U\\*]
 SM]
 SM]
 #]
 $D>]
 'tn]
 d^]
" 
uU\\"N2	3#]
 H]

@add_start_docstrings(
    "The ModernBert Model with a sequence classification head on top that performs pooling.",
    MODERNBERT_START_DOCSTRING,
)
class ModernBertForSequenceClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MODERNBERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if self.config.classifier_pooling == "cls":
            last_hidden_state = last_hidden_state[:, 0]
        elif self.config.classifier_pooling == "mean":
            last_hidden_state = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(
                dim=1, keepdim=True
            )

        pooled_output = self.head(last_hidden_state)
        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@add_start_docstrings(
    "The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.",
    MODERNBERT_START_DOCSTRING,
)
class ModernBertForTokenClassification(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MODERNBERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

@add_start_docstrings(
    """
    The ModernBert Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    MODERNBERT_START_DOCSTRING,
)
class ModernBertForQuestionAnswering(ModernBertPreTrainedModel):
    def __init__(self, config: ModernBertConfig):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.model = ModernBertModel(config)
        self.head = ModernBertPredictionHead(config)
        self.drop = torch.nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MODERNBERT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        sliding_window_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        indices: Optional[torch.Tensor] = None,
        cu_seqlens: Optional[torch.Tensor] = None,
        max_seqlen: Optional[int] = None,
        batch_size: Optional[int] = None,
        seq_len: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        self._maybe_set_compile()

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            sliding_window_mask=sliding_window_mask,
            position_ids=position_ids,
            indices=indices,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
            batch_size=batch_size,
            seq_len=seq_len,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        last_hidden_state = self.head(last_hidden_state)
        last_hidden_state = self.drop(last_hidden_state)
        logits = self.classifier(last_hidden_state)

        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = None
        if start_positions is not None and end_positions is not None:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
rW   r^  )r#   r  rC  rZ  r\  r]  r^  r   r3  )ZrV  
contextlibr   typingr   r   r   r   r   rz   torch.nn.functionalr	   r   r  torch.utils.checkpointtorch.nnr
   r   r   activationsr   configuration_utilsr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   utils.import_utilsr   gemma.modeling_gemmar   r   flash_attn.flash_attn_interfacer    flash_attn.layers.rotaryr!   flash_attn.ops.triton.rotaryr"   objectr  r  
get_loggerre   ru  r#   r   r~   r   r   autogradFunctionr   r   r   rU  r   r   r   r   r4  r  r  rv   r  r  r1  r   r6  MODERNBERT_START_DOCSTRINGrC  r  r  rX  rZ  r\  r]  r^  __all__r2   rW   rV   <module>r
     s     " 8 8      A A ! 3 B  .  6 M P89O3 $			H	%A' AN ,0%)	&mLL&mLL&m 5<<(&m U\\"	&m
 5<<u||S(5<<:PRZ[`[g[gRhhi&mRLL\\  	
 \\>46%..11 46v *. $L &	L
 L42Q 2Qj299 <:BII :(_ 4 _ )."!"	" LL" 	"
 5++," 38_" 	" 
"  ~" 5u||+,eELL.AAB"\ !&(!!(!	(! 2(! 	(!
 (! 38_(! 	(! 
(! ++(! 5<<(!V ! 	  LL  	 
 5++,  38_  	  
  5<< H 1$"! M3")) M3`+3RYY +3\ " ^B B	BJ: z ^k:/ k:	k:\	>ryy 	> `}
5 }
	}
@ \k
*C k
	k
\ rN
'@ N
	N
b  M
%> M
M
`rW   