
    %	&h                        d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
c mZ d dlm
Z
 d dlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)  e$       rd dl*m+Z+ d dl,m-Z- d dl.m/Z/ ne0Z- e%jb                  e2      Z3dZ4dZ5 G d dejl                  jn                        Z8	 	 dUdeejr                     dee:   fdZ; G d de-      Z< G d de
jz                        Z> G d de
jz                        Z? G d  d!e
jz                        Z@d" ZAdVd#ZB	 dWd$d%d&ejr                  d'ejr                  d(ejr                  d)eej                     d*ee:e:f   d+e:d,e:d-eeD   d.eeejr                  ejr                  f   eejr                     f   fd/ZEej                  fd$d%d&ejr                  d0e<dejr                  de:d*ee:e:f   d+e:d,e:d1ej                  d.eejr                     fd2ZHd$d%d&ejr                  d'ejr                  d(ejr                  d)eej                     d*ee:e:f   d+e:d,e:d.eejr                     fd3ZIeHeEeId4ZJ G d5 d%e
jz                        ZK G d6 d7e
jz                        ZLd8ZM e"d9eM       G d: d;e             ZN	 	 dUd<ejr                  d'ejr                  d)eejr                     d=eejr                     d.eejr                  ejr                  ejr                  e:eejr                     eejr                     f   f
d>ZOd<ejr                  d?ejr                  d@e:dAe:d.ejr                  f
dBZPdCZQ e"d9eM       G dD dEeN             ZR G dF dGe
jz                        ZS e"dHeM       G dI dJeN             ZT e"dKeM       G dL dMeN             ZU e"dNeM       G dO dPeN             ZV e"dQeM       G dR dSeN             ZWg dTZXy)X    N)nullcontext)DictOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)_prepare_4d_attention_mask)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_flash_attn_2_availablelogging)is_triton_available   )ModernBertConfig) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryzanswerdotai/ModernBERT-baser   c                   \    e Zd Ze	 	 ddeej                     dee   fd       Zed        Z	y)ApplyRotaryEmbUnpadN
cu_seqlens
max_seqlenc           
          |j                         }|j                  \  }}}}	|d d d df   j                  |d|	      }
t        |
||d||dd       | j	                  |||       || _        |S )N   r   FT)seqlen_offsetsr$   r%   interleavedinplace)
contiguousshapeviewr!   save_for_backwardr%   )ctxqkvcossinr$   r%   	total_nnz_three_nheadsheaddimqks              /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.pyforwardzApplyRotaryEmbUnpad.forwardC   s     nn.1ii+	67G BQBZ__YG4!!		
 	c3
3#
    c                     | j                   \  }}}|j                         }|j                  \  }}}}|d d d df   j                  |d|      }	t	        |	||d|| j
                  ddd	       |d d d d d d fS )Nr'   r(   r   FT)r)   r$   r%   r*   r+   	conjugate)saved_tensorsr,   r-   r.   r!   r%   )
r0   dor2   r3   r$   r4   r5   r6   r7   dqks
             r9   backwardzApplyRotaryEmbUnpad.backwardb   s    "00S*]]_.0hh+	67G BQBinnYG4!~~
	
 4tT455r;   NN)
__name__
__module____qualname__staticmethodr   torchTensorintr:   rA    r;   r9   r#   r#   B   sQ     .2$(
 U\\* SM < 6 6r;   r#   r$   r%   c                 4    t         j                  | ||||      S )a  
    Arguments:
        qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
            Most commonly used in inference when we have KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (total_nnz, dim)
    rotary_dim must be <= headdim
    Apply rotary embedding to the first rotary_dim of x.
    )r#   apply)r1   r2   r3   r$   r%   s        r9   apply_rotary_unpaddedrM   y   s    . $$S#sJ
KKr;   c                   "    e Zd ZdZ	 	 	 	 ddededee   deej                     deej                     f
 fdZ
	 ddej                  d	ej                  dee   d
eej                  eej                  ej                  f   f   fdZd
efdZ xZS )!ModernBertUnpaddedRotaryEmbeddingzP
    The rotary position embeddings applied directly to unpadded sequences.
    dimbaser%   devicedtypec                 v    t         |   ||d|d       || _        |||| j                  |||       yyyy)a  
        max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
            up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
            the cos_sin_cache wll be recomputed during the forward pass.
        TF)rP   rQ   pos_idx_in_fp32rR   r*   NrR   rS   )super__init__r%   _update_cos_sin_cache)selfrP   rQ   r%   rR   rS   	__class__s         r9   rX   z*ModernBertUnpaddedRotaryEmbedding.__init__   sV     	StT&^cd$!f&8U=N&&z&&N >O&8!r;   r1   r$   returnc                     |(| j                  ||j                  |j                         t        || j                  | j
                  ||      }|S )z
        Apply rotary embedding *inplace* to qkv.
        qkv: (total_nnz, 3, nheads, headdim)
        cu_seqlens: (batch + 1,) cumulative sequence lengths
        max_seqlen: int max seq length in the batch
        rV   r$   r%   )rY   rR   rS   rM   _cos_cached_sin_cached)rZ   r1   r$   r%   s       r9   r:   z)ModernBertUnpaddedRotaryEmbedding.forward   sS     !&&z#**CII&V#!!
 
r;   c                 T    d| j                    d| j                   d| j                   S )Nzdim=z, base=z, scale_base=)rP   rQ   
scale_baserZ   s    r9   
extra_reprz,ModernBertUnpaddedRotaryEmbedding.extra_repr   s(    dhhZwtyykt>OPPr;   )g     @NNNN)rC   rD   rE   __doc__rI   floatr   rG   rR   rS   rX   rH   r   r   r:   strrd   __classcell__r[   s   @r9   rO   rO      s     $()-'+OO O SM	O
 &O $O. %)	\\ LL SM	
 
u||U5<<#=>>	?2QC Qr;   rO   c                        e Zd ZdZdef fdZ ej                  d      dej                  dej                  fd       Z
	 ddeej                     d	eej                     dej                  fd
Z xZS )ModernBertEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    configc                 d   t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        j                  |j                  |j                  |j                        | _        t        j                  |j                        | _        y )N)padding_idxepsbias)rW   rX   rm   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdroprZ   rm   r[   s     r9   rX   zModernBertEmbeddings.__init__   sw     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	r;   Tdynamic	input_idsr\   c                 `    | j                  | j                  | j                  |                  S re   )r~   r{   rw   )rZ   r   s     r9   compiled_embeddingsz(ModernBertEmbeddings.compiled_embeddings   s%    yy4#6#6y#ABCCr;   inputs_embedsc                     |"| j                  | j                  |            }|S | j                  j                  r| j	                  |      n.| j                  | j                  | j                  |                  }|S re   )r~   r{   rm   reference_compiler   rw   )rZ   r   r   hidden_statess       r9   r:   zModernBertEmbeddings.forward   su     $ IIdii&>?M  ;;00 ((3YYtyy)<)<Y)GHI 
 r;   rB   )rC   rD   rE   rf   r   rX   rG   compile
LongTensorrH   r   r   r:   ri   rj   s   @r9   rl   rl      s    9/ 9 U]]4 DU-=-= D%,, D !D ei!%"2"23KSTYT`T`Ka	r;   rl   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )ModernBertMLPa6  Applies the GLU at the end of each ModernBERT layer.

    Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
    and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
    rm   c                    t         |           || _        t        j                  |j
                  t        |j                        dz  |j                        | _	        t        |j                     | _        t        j                  |j                        | _        t        j                  |j                  |j
                  |j                        | _        y )Nr'   rr   )rW   rX   rm   r   Linearru   rI   intermediate_sizemlp_biasWir   hidden_activationactr|   mlp_dropoutr~   Wor   s     r9   rX   zModernBertMLP.__init__   s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_r;   r   r\   c                     | j                  |      j                  dd      \  }}| j                  | j                  | j	                  |      |z              S )Nr'   r(   rP   )r   chunkr   r~   r   )rZ   r   inputgates       r9   r:   zModernBertMLP.forward   sI    ggm,221"2=twwtyy%4!7899r;   )
rC   rD   rE   rf   r   rX   rG   rH   r:   ri   rj   s   @r9   r   r      s2    `/ `:U\\ :ell :r;   r   c            
            e Zd Zddedededeej                     f fdZ	 ej                         ed               Z xZS )ModernBertRotaryEmbeddingrm   rP   rQ   rR   c                    t         |           t        |d      rG|j                  ;|j                  j	                  d|j                  j	                  d            | _        nd| _        |j                  | _        |j                  | _        || _	        t        | j
                     | _        | j                  d |||      \  }| _        | j                  d|d       | j                  | _        y )	Nrope_scaling	rope_typetypedefault)rP   rQ   inv_freqF)
persistent)rW   rX   hasattrr   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrm   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)rZ   rm   rP   rQ   rR   r   r[   s         r9   rX   z"ModernBertRotaryEmbedding.__init__   s    6>*v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T6sY]+<+^($(ZeD!%r;   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r(   r   mpscpuF)device_typeenabledr'   r   )rS   )r   rg   expandr-   torR   
isinstancer   rh   rG   autocast	transposecatr2   r   r3   rS   )
rZ   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr2   r3   s
             r9   r:   z!ModernBertRotaryEmbedding.forward  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.re   )rC   rD   rE   r   rI   rg   r   rG   rR   rX   no_gradr   r:   ri   rj   s   @r9   r   r      sV    // /c / /PXY^YeYePf /  U]]_<  <r;   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr(   r'   r   )r-   rG   r   )r   x1x2s      r9   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r;   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr2   r3   r   unsqueeze_dimq_embedk_embeds           r9   apply_rotary_pos_embr   "  sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr;   moduleModernBertAttentionr1   attention_masksliding_window_maskr   local_attentionbsrP   output_attentionsr\   c	                    | j                  ||      \  }
}|j                  dd      j                  d      \  }}}t        |||
|      \  }}| j                  dz  }t        j                  ||j                  dd            |z  }|dk7  r|}||z   }t        j                  j                  |dt
        j                  	      j                  |j                        }t        j                  j                  || j                  | j                  
      }t        j                  ||      }|j                  dd      j!                         }|j#                  |d|      }|r||fS |fS )Nr   r   r   r'   r         ࿩r(   r(   r(   rP   rS   )ptraining)
rotary_embr   unbindr   head_dimrG   matmulr   
functionalsoftmaxfloat32r   rS   dropoutattention_dropoutr   r,   r.   )r   r1   r   r   r   r   r   rP   r   _kwargsr2   r3   querykeyvaluescaleattn_weightsattn_outputs                     r9   eager_attention_forwardr   =  sK      < @HCa+22q29E3%eS#s;JE3OOT!E<<s}}Q':;eCL(",.0L ==((2U]](SVVW\WbWbcL==((9Q9Q\b\k\k(lL,,|U3K''1-88:K""2r3/K\**>r;   r   target_dtypec	                     ||||      }|j                   t        j                  t        j                  fv}
|
rb|j                   }|j	                  |      }t        |||| j                  r| j                  nd| j                  |      }|j	                  |      }n3t        |||| j                  r| j                  nd| j                  |      }|j                  ||      fS )Nr^           )r$   r%   	dropout_pdeterministicwindow_size)
rS   rG   float16bfloat16r   r   r   r   deterministic_flash_attnr.   )r   r1   r   r$   r%   r   r   rP   r   r   convert_dtype
orig_dtypeattns                r9   flash_attention_forwardr   b  s     SZJ
GCIIemmU^^%DDM YY
ff\"/!!28//f..s 99'
 wwz"/!!28//f..s 99'
 IIb#  r;   c                 v   | j                  ||      \  }	}
|j                  dd      j                  d      \  }}}t        |||	|
      \  }}|dk7  r|}t	        j
                  |||| j                  r| j                  nd|      j                  dd      j                         }|j                  |d	|      }|fS )
Nr   r   r   r'   r   r   r   )r   	attn_maskr(   )
r   r   r   r   Fscaled_dot_product_attentionr   r   r,   r.   )r   r1   r   r   r   r   r   rP   r   r2   r3   r   r   r   r   s                  r9   sdpa_attention_forwardr     s       < @HCa+22q29E3%eS#s;JE3(", 	
&&28//f..s$	
 
1a	  ""2r3/K>r;   )flash_attention_2eagersdpac                   z     e Zd ZdZd	dedee   f fdZ	 d
dej                  dee
   dej                  fdZ xZS )r   a  Performs multi-headed self attention on a batch of unpadded sequences.

    If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
    If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
    which requires padding and unpadding inputs, adding some overhead.

    See `forward` method for additional details.
    rm   layer_idc                    t         |           || _        || _        |j                  |j
                  z  dk7  r&t        d|j                   d|j
                   d      |j                  | _        |j                  | _        |j
                  | _	        |j                  |j
                  z  | _
        | j                  | j                  z  | _        t        j                  |j                  d| j                  z  |j                        | _        ||j                   z  dk7  r$|j"                  dz  |j"                  dz  f| _        nd| _        |j$                  }|j&                  }| j"                  dk7  r$|j(                  |j(                  }|j"                  }|j*                  d	k(  rt-        | j                  ||
      | _        nt1        || j                  |      | _        t        j                  |j                  |j                  |j                        | _        |j                  dkD  rt        j4                  |j                        nt        j6                         | _        t;               | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   r'   r   r   )rP   r%   rQ   )rm   rP   rQ   r   )rW   rX   rm   r  ru   num_attention_heads
ValueErrorr   r   	num_headsr   all_head_sizer   r   attention_biasWqkvglobal_attn_every_n_layersr   global_rope_thetar   local_rope_theta_attn_implementationrO   r   r   r   r|   Identityout_dropsetpruned_heads)rZ   rm   r  
rope_thetar   r[   s        r9   rX   zModernBertAttention.__init__  s      : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%33**f.H.HH!]]T^^;IIf00!d6H6H2HvOdOde	f7771<$*$:$:a$?AWAW[\A\#]D #+D --
"("@"@8+&&2#44
&,&<&<#&&*==?MM.EJDO 8v4==_ijDO))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqEr;   r   r   r\   c           
         | j                  |      }|j                  d   }| j                  j                  dk(  r)|j	                  dd| j
                  | j                        }n)|j	                  |dd| j
                  | j                        }t        | j                  j                     | f|| j                  | j                  || j                  |d|}|d   }| j                  | j                  |            }|f|dd  z   S )Nr   r   r(   r   )r1   r   r   r   rP   r   r   )r	  r-   rm   r  r.   r  r   MODERNBERT_ATTENTION_FUNCTIONr   r   r  r  r   )rZ   r   r   kwargsr1   r   attn_outputss          r9   r:   zModernBertAttention.forward  s     ii&  #;;++/BB((2q$..$--@C((2r1dnndmmDC4T[[5U5UV	
 00""/	
 	
 %Qdggm&<=,qr"222r;   re   F)rC   rD   rE   rf   r   r   rI   rX   rG   rH   boolr:   ri   rj   s   @r9   r   r     sS    &"/ &"8C= &"V -23||3 $D>3
 
3r;   c                   f    e Zd Zddedee   f fdZ ej                  d      dej                  dej                  fd       Z
	 	 	 	 	 	 ddej                  d	eej                     d
eej                     deej                     deej                     dee   dee   dej                  fdZ xZS )ModernBertEncoderLayerrm   r  c                    t         |           || _        |dk(  rt        j                         | _        n;t        j                  |j                  |j                  |j                        | _        t        ||      | _        t        j                  |j                  |j                  |j                        | _        t        |      | _        y )Nr   rp   )rm   r  )rW   rX   rm   r   r  	attn_normrx   ru   ry   rz   r   r   mlp_normr   mlprZ   rm   r  r[   s      r9   rX   zModernBertEncoderLayer.__init__  s    q=[[]DN\\&*<*<&//X^XhXhiDN'vI	V%7%7V__SYScScd (r;   Tr   r   r\   c                 B    | j                  | j                  |            S re   )r  r  rZ   r   s     r9   compiled_mlpz#ModernBertEncoderLayer.compiled_mlp  s    xxm455r;   r   r   r   r$   r%   r   c           	      
   | j                  | j                  |      ||||||      }||d   z   }| j                  j                  r| j	                  |      n| j                  | j                  |            }	||	z   }|f|dd  z   S )Nr   r   r   r$   r%   r   r   r   )r   r  rm   r   r"  r  r  )
rZ   r   r   r   r   r$   r%   r   r  
mlp_outputs
             r9   r:   zModernBertEncoderLayer.forward  s     yyNN=)) 3%!!/ ! 
 &Q7 {{,, m,$--67 	
 &
2,qr"222r;   re   )NNNNNF)rC   rD   rE   r   r   rI   rX   rG   r   rH   r"  r   r  r:   ri   rj   s   @r9   r  r    s    	)/ 	)8C= 	) U]]4 6%,, 65<< 6 !6 266:37-1$(,13||3 !.3 &ell3	3
 u//03 U\\*3 SM3 $D>3 
3r;   r  aO  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ModernBertConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
zXThe bare ModernBert Model outputting raw hidden-states without any specific head on top.c                        e Zd ZeZdZdZddgZdZdZ	dZ
dej                  fdZe	 	 	 	 dded	eej$                     d
eeeeeef   f      def fd       Zd Z fdZ xZS )ModernBertPreTrainedModelmodelTrl   r  Fr   c                    | j                   j                  ddt        j                  dt        ffd}| j                   j
                  | j                   j
                  t        j                  d| j                   j                  z        z  | j                   j
                  | j                   j                  dz  d}t        |t              r ||j                  |d          y t        |t              r- ||j                  |d	           ||j                  |d
          y t        |t               r- ||j"                  |d	           ||j                  |d
          y t        |t$              r ||j&                  |d
          y t        |t(              r ||j*                  |d
          y t        |t,        t.        t0        f      r ||j2                  |d          y t        |t        j4                        rW|j6                  j8                  j;                  d       |j<                  %|j<                  j8                  j?                          y y y )Nr   r   stdc                    t         j                  j                  | j                  d| |z  |z         t	        | t         j
                        r7| j                  *t         j                  j                  | j                         y y y )Nr   )meanr*  ab)r   inittrunc_normal_weightr   r   rr   zeros_)r   r*  cutoff_factors     r9   init_weightz<ModernBertPreTrainedModel._init_weights.<locals>.init_weightX  sq    GG!! .3&#% "  &")),;;*GGNN6;;/ + -r;   g       @r   )inout	embedding	final_outr7  r5  r6  r8  g      ?) rm   initializer_cutoff_factorr   Modulerg   initializer_rangemathsqrtnum_hidden_layersru   r   rl   rw   r   r   r   r   r	  ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassification ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierrx   r1  datafill_rr   zero_)rZ   r   r4  stdsr3  s       @r9   _init_weightsz'ModernBertPreTrainedModel._init_weightsS  s   == M	0		 	0 	0 ++//;;00499S4;;C`C`=`3aa6600$6	
 f23--tK/@A.		4:.		4;/ 34T$Z0		4;/ 89d5k2 56U402RTrs
 ))4+<=-MM$$S){{&  &&( ' .r;   use_flash_attention_2torch_dtype
device_mapcheck_device_mapc                     |j                   ,d|_         	 | j                  |t        j                  |d|      S t        |   ||t        j                  ||      S # t        t
        f$ r
 d |_         Y :w xY w)Nr   F)rM  rN  hard_check_onlyrO  )rL  rM  rN  rO  )_attn_implementation_internal_check_and_enable_flash_attn_2rG   r   r  ImportErrorrW   _autoset_attn_implementation)clsrm   rL  rM  rN  rO  r[   s         r9   rU  z6ModernBertPreTrainedModel._autoset_attn_implementation  s     //73FF0	<99 %)$)%5 :   w3"7!- 4 
 	
 , <7;4<s   #A A54A5c                    | j                   j                  du ry t        | d      rTt        | j                        dkD  r<| j                   j                  rt
        j                  d       d| j                   _        | j                  j                  dk(  r<| j                   j                  rt
        j                  d       d| j                   _        | j                  j                  dk(  r<| j                   j                  rt
        j                  d       d| j                   _        | j                   j                  t               | j                   _        y y )	NFhf_device_mapr   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
rm   r   r   lenrX  loggerwarning_oncerR   r   r   rc   s    r9   _maybe_set_compilez,ModernBertPreTrainedModel._maybe_set_compile  s   ;;((E14)c$2D2D.E.I{{,,##9 -2DKK);;u${{,,##9 -2DKK);;u${{,,##9 -2DKK);;((0,?,ADKK) 1r;   c                     t        |   |i |}| j                  j                  dv r<| j                  j                  rt        j                  d       d| j                  _        |S )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rW   resize_token_embeddingsrm   r   rZ  r[  )rZ   argsr  model_embedsr[   s       r9   r^  z1ModernBertPreTrainedModel.resize_token_embeddings  s[    w6GG;;((L8{{,,##y -2DKK)r;   )FNNT)rC   rD   rE   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_2_supports_sdpa_supports_flex_attnr   r:  rK  classmethodr  r   rG   rS   r   rh   r   rI   rU  r\  r^  ri   rj   s   @r9   r'  r'  F  s    
 $L&*#/1IJ!N-)BII -)^  ',-1;?!%
  $
 ekk*	

 U3S#X#678
 
 
@B>
 
r;   r'  inputslabelsc                    |j                  dt        j                        }t        j                  |j	                         d      j	                         }t        |j                         j                               }t        j                  j                  j                  t        j                  |dt        j                        d      }| j                         dk(  r| j	                         |   }n*| j                  ^}	}
}|	|
z  } | j                  |g| |   }||j	                         |   nd}||j	                         |   nd}||||||fS )	a  
    Remove padding from input sequences.

    Args:
        inputs: (batch, seqlen, ...) or (batch, seqlen)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        position_ids: (batch, seqlen), int, position ids
        labels: (batch, seqlen), int, labels

    Returns:
        unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        cu_seqlens: (batch + 1), the cumulative sequence lengths
        max_seqlen_in_batch: int
        unpadded_position_ids: (total_nnz) or None
        unpadded_labels: (total_nnz) or None
    r(   r   F)as_tupler   )r   r   r'   N)sumrG   int32nonzeroflattenrI   maxitemr   r   padcumsumrP   r-   r.   )ri  r   r   rj  seqlens_in_batchindicesmax_seqlen_in_batchr$   unpadded_inputsbatchseqlenrestr-   unpadded_position_idsunpadded_labelss                  r9   _unpad_modernbert_inputr~    s,   . &))b)DmmN224uEMMOG.22499;<$$((6FAUZU`U`)acijJzz|q ..*73%||v%&++e3d3G<?K?WL0027;]a393Efnn&w/4OGZ1DF[]lllr;   rv  ry  rz  c                 l   | j                         dk(  rHt        j                  ||z  | j                  | j                        }| ||<   |j                  ||      }|S | j                  ^}}t        j                  ||z  g|| j                  | j                  d}| ||<    |j
                  ||g| }|S )aQ  
    Add padding to sequences.

    Args:
        inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz)
        batch: int, batch size
        seqlen: int, max sequence length

    Returns:
        padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
    r   )rS   rR   )rP   rG   zerosrS   rR   r.   r-   )ri  rv  ry  rz  outputpadded_inputs_r{  s           r9   _pad_modernbert_outputr    s    $ zz|qUV^6<<V wE62  <<DUV^]d]&,,v}}] w#E69D9r;   a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. With Flash Attention 2.0, padding will be ignored
            by default should you provide it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
            perform global attention, while the rest perform local attention. This mask is used to avoid attending to
            far-away tokens in the local attention layers when not using Flash Attention.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
            Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
        cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
            Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
        max_seqlen (`int`, *optional*):
            Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
        batch_size (`int`, *optional*):
            Batch size of the input sequences. Used to pad the output tensors.
        seq_len (`int`, *optional*):
            Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c            "           e Zd Zdef fdZd Zd Z ee       e	e
ee      	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                      deej                      d	eej                     d
eej                      deej                      deej                      dee   dee   dee   dee   dee   dee   deeej                   df   ef   fd              Zdej                   dedej                   fdZ xZS )ModernBertModelrm   c           	         t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        j                  |j                  |j                  |j                        | _        d| _        | j#                          y c c}w )Nrp   F)rW   rX   rm   rl   
embeddingsr   
ModuleListranger>  r  layersrx   ru   ry   rz   
final_normgradient_checkpointing	post_initr  s      r9   rX   zModernBertModel.__init__Y  s     .v6mmFKFLdLdFef(#FH5f
 ,,v'9'9vU[UeUef&+#	 gs   C c                 .    | j                   j                  S re   r  rw   rc   s    r9   get_input_embeddingsz$ModernBertModel.get_input_embeddingsd  s    ---r;   c                 &    || j                   _        y re   r  )rZ   r   s     r9   set_input_embeddingsz$ModernBertModel.set_input_embeddingsg  s    ).&r;   
checkpointoutput_typera  r   r   r   r   r   rv  r$   r%   
batch_sizeseq_lenr   output_hidden_statesreturn_dictr\   .c                 j  	
 ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      |rdnd }|rdnd }| j                          || j                  ||       	)
'||j                  d d \  	
n|j                  d d \  	
||j                  n|j                  }|(t        j                  	
f|t        j                        }d}| j                   j                  dk(  rM||d}|0t        j                         5  t        ||      ^}}}}d d d        nQt        ||      ^}}}}n>|&t        j                  
|	      j!                  d
      }| j#                  ||      \  }}| j%                  ||      }| j&                  D ]t  }|r||fz   }| j(                  r/| j*                  r#| j-                  |j.                  |||||||      }n ||||||||      }|d
   }|s]t1        |      dkD  sl||d   fz   }v |r||fz   }| j3                  |      }|r't5        |	
      }|t7        	
fd|D              }|st7        d |||fD              S t9        |||      S # 1 sw Y   xY w)Nz:You must specify exactly one of input_ids or inputs_embedsrJ   r'   rV   Fr   T)ri  r   )rR   r   )r   )r   r   r$  r   ri  rv  ry  rz  c              3   <   K   | ]  }t        |         yw)r  N)r  ).0hsr  rv  r  s     r9   	<genexpr>z*ModernBertModel.forward.<locals>.<genexpr>  s(      * +"gZ`ghh*s   c              3   &   K   | ]	  }||  y wre   rJ   )r  vs     r9   r  z*ModernBertModel.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_stater   
attentions)rm   r   r  use_return_dictr  r\  %warn_if_padding_and_no_attention_maskr-   rR   rG   onesr  r  r   r~  aranger   _update_attention_maskr  r  r  r   _gradient_checkpointing_func__call__rY  r  r  tupler   )rZ   r   r   r   r   r   rv  r$   r%   r  r  r   r  r  all_hidden_statesall_self_attentionsrR   repadr  r   encoder_layerlayer_outputss         `  ``           r9   r:   zModernBertModel.forwardj  sG   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ"6BD$5b4! 66y.Q'/(&3&9&9"1&=#
G&/oobq&9#
G%.%:!!@T@T!"ZZW(=fTYT^T^_N;;++/BB:#5*:L ( I`#,^JF	7J
Q 
 Ja,^JFM7J
Q #$||GFCMMaP262M2M2C 3N 3/N/ )=Y![[ 	PM#$58H$H!**t}} $ A A!**!"' %	! !.!#1(;!-))&7! *!,M S%7!%;&9]1=M<O&O#7	P:   1]4D D62$gZPWM !,$) */* %!
 m]4EGZ$[mmm++*
 	
A s   >J((J2c                     |r| j                   j                  dk(  r't        j                  d       d| j                   _        nF| j                   j                  dk7  r-t        j                  d| j                   j                   d       t	        || j
                        }t        j                  |j                  d         j                  d      }t        j                  ||j                  z
        }|| j                   j                  dz  k  j                  d      j                  d      j                  |j                        }|j                  |j!                         t        j"                  | j
                        j$                        }||fS )Nr   zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r   zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r'   r   )rm   r  rZ  r[  r   rS   rG   r  r-   r   absTr   r   rR   masked_filllogical_notfinfomin)rZ   r   r   global_attention_maskrowsdistancewindow_maskr   s           r9   r  z&ModernBertModel._update_attention_mask  sS   {{//69##V 4;011W<##  $ @ @A B:: !;>4:: V ||177:;EEaH99TDFF]+ 4499DDQGQQRSTWWXfXmXmn 	 4??@W@W@Y[`[f[fgkgqgq[r[v[vw$&999r;   NNNNNNNNNNNNN)rC   rD   rE   r   rX   r  r  r   MODERNBERT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rG   r   rH   rI   r  r   r   r:   r  ri   rj   s   @r9   r  r  T  s   
	/ 	./ ++FG&#$ 15156:3704*.-1$($(!%,0/3&*t
E,,-t
 !.t
 &ell3	t

 u//0t
  -t
 %,,'t
 U\\*t
 SMt
 SMt
 #t
 $D>t
 'tnt
 d^t
 
uU\\3&'8	9t
 Ht
l:U\\ :VZ :_d_k_k :r;   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )r?  rm   c                 J   t         |           || _        t        j                  |j
                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        y )Nrp   )rW   rX   rm   r   r   ru   classifier_biasr@  r   classifier_activationr   rx   ry   rz   r{   r   s     r9   rX   z!ModernBertPredictionHead.__init__  sq    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r;   r   r\   c                 `    | j                  | j                  | j                  |                  S re   )r{   r   r@  r!  s     r9   r:   z ModernBertPredictionHead.forward  s#    yy$**]";<==r;   )	rC   rD   rE   r   rX   rG   rH   r:   ri   rj   s   @r9   r?  r?    s-    a/ a>U\\ >ell >r;   r?  zZThe ModernBert Model with a decoder head on top that is used for masked language modeling.c            #       `    e Zd ZdgZdef fdZd Zdej                  fdZ	 e
j                  d      d	e
j                  d
e
j                  fd       Z ee       eeee      	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee
j*                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee
j                     dee   dee   dee   dee   dee   dee   d
eee
j                     ef   fd              Z xZS )rA  zdecoder.weightrm   c                 t   t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                  |j                        | _        | j                  j                  | _        | j                  j                  | _        | j                          y )Nr   )rW   rX   rm   r  r(  r?  headr   r   ru   rt   decoder_biasrB  sparse_predictionsparse_pred_ignore_indexr  r   s     r9   rX   zModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	r;   c                     | j                   S re   rB  rc   s    r9   get_output_embeddingsz+ModernBertForMaskedLM.get_output_embeddings&  s    ||r;   new_embeddingsc                     || _         y re   r  )rZ   r  s     r9   set_output_embeddingsz+ModernBertForMaskedLM.set_output_embeddings)  s	    %r;   Tr   r  r\   c                 B    | j                  | j                  |            S re   )rB  r  )rZ   r  s     r9   compiled_headz#ModernBertForMaskedLM.compiled_head,  s    ||DIIf-..r;   r  r   r   r   r   r   rj  rv  r$   r%   r  r  r   r  r  c                 H   ||n| j                   j                  }| j                          | j                   j                  dk(  r|||	|
)|'||j                  d d \  }
}n|j                  d d \  }
}||j
                  n|j
                  }|(t        j                  |
|f|t        j                        }|4t        j                         5  t        ||||      \  }}}}	}}d d d        nt        ||||      \  }}}}	}}| j                  ||||||||	|
||||      }|d   }| j                  rK|I|j                  d      }|j                  |j                  d   d      }|| j                  k7  }||   }||   }| j                   j                  r| j!                  |      n| j#                  | j%                  |            }d }|(| j'                  ||| j                   j(                        }| j                   j                  dk(  rN| j                   j*                  s|
t-               nt        j                         5  t/        |||
|	      }d d d        |s|f}||f|z   S |S t1        |||j2                  |j4                  
      S # 1 sw Y   xY w# 1 sw Y   HxY w)Nr   r'   rV   )ri  r   r   rj  r   r   r   r   r   rv  r$   r%   r  r  r   r  r  r   r(   )rt   r  losslogitsr   r  )rm   r  r\  r  r-   rR   rG   r  r  r   r~  r(  r  r.   r  r   r  rB  r  loss_functionrt   repad_logits_with_gradr   r  r   r   r  )rZ   r   r   r   r   r   rj  rv  r$   r%   r  r  r   r  r  r  rR   outputsr  mask_tokensr  r  r  s                          r9   r:   zModernBertForMaskedLM.forward0  s   0 &1%<k$++B]B]!;;++/BB:#5*:L%'/$0.;.A.A"1.E+
G.7oobq.A+
G-6-B))H\H\!)%*ZZW0Ef\a\f\f%gN ( [r#,^Zfou\X	7J
LRX 
 \s,^Zfou\XM7J
LRX **) 3%'!!!/!5#  
 $AJ!!f&8[[_F 1 6 6v||A K !D$A$AAK 1+ >K(F {{,, 01dii(9:; 	 %%ffAWAW%XD;;++/BB"&++"D"D\a\i\i\k r/vwV`ipqr YF)-)9TGf$EvE!//))	
 	
m ^r rs   JJJJ!NNNNNNNNNNNNNN)rC   rD   rE   _tied_weights_keysr   rX   r  r   r   r  rG   r   rH   r  r   r  r   r  r   r  r   r   rI   r  r   r   r:   ri   rj   s   @r9   rA  rA    s   
 ++/ &BII & U]]4 /ELL /U\\ / !/ ++FG&"$ 15156:/304)-*.-1$($(!%,0/3&*]
E,,-]
 !.]
 &ell3	]

 u||,]
  -]
 &]
 %,,']
 U\\*]
 SM]
 SM]
 #]
 $D>]
 'tn]
 d^]
" 
uU\\"N2	3#]
 H]
r;   rA  zVThe ModernBert Model with a sequence classification head on top that performs pooling.c            #           e Zd Zdef fdZ ee       eee	e
      	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   dee   dee   dee   deeej                     e	f   fd              Z xZS )rC  rm   c                 n   t         |   |       |j                  | _        || _        t	        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j!                          y re   )rW   rX   
num_labelsrm   r  r(  r?  r  rG   r   r|   classifier_dropoutr~   r   ru   rF  r  r   s     r9   rX   z,ModernBertForSequenceClassification.__init__  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r;   r  r   r   r   r   r   rj  rv  r$   r%   r  r  r   r  r  r\   c                 f   ||n| j                   j                  }| j                          | j                  ||||||||	|
||||      }|d   }| j                   j                  dk(  r
|dddf   }nQ| j                   j                  dk(  r8||j                  d      z  j                  d      |j                  dd	
      z  }| j                  |      }| j                  |      }| j                  |      }d}|| j                   j                  | j                  dk(  rd| j                   _
        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _
        nd| j                   _
        | j                   j                  dk(  rIt!               }| j                  dk(  r& ||j#                         |j#                               }n |||      }n| j                   j                  dk(  r=t%               } ||j'                  d| j                        |j'                  d            }n,| j                   j                  dk(  rt)               } |||      }|s|f}||f|z   S |S t+        |||j,                  |j.                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   rV  r,  r(   r   r   T)rP   keepdim
regressionsingle_label_classificationmulti_label_classificationr  )rm   r  r\  r(  classifier_poolingr   rm  r  r~   rF  problem_typer  rS   rG   longrI   r   squeezer
   r.   r	   r   r   r  )rZ   r   r   r   r   r   rj  rv  r$   r%   r  r  r   r  r  r  r  r  pooled_outputr  r  loss_fctr  s                          r9   r:   z+ModernBertForSequenceClassification.forward  s   < &1%<k$++B]B]!**) 3%'!!!/!5#  
 $AJ;;))U2 1!Q$ 7[[++v5!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./YF)-)9TGf$EvE'!//))	
 	
r;   r  )rC   rD   rE   r   rX   r   r  r   r  r   r  r   rG   r   rH   rI   r  r   r   r:   ri   rj   s   @r9   rC  rC    s   
/  ++FG&,$ 15156:/304)-*.-1$($(!%,0/3&*W
E,,-W
 !.W
 &ell3	W

 u||,W
  -W
 &W
 %,,'W
 U\\*W
 SMW
 SMW
 #W
 $D>W
 'tnW
 d^W
" 
uU\\"$<<	=#W
 HW
r;   rC  zlThe ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.c            #           e Zd Zdef fdZ ee       eee	e
      	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   dee   dee   dee   deeej                     e	f   fd              Z xZS )rD  rm   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y re   rW   rX   r  r  r(  r?  r  rG   r   r|   r  r~   r   ru   rF  r  r   s     r9   rX   z)ModernBertForTokenClassification.__init__  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r;   r  r   r   r   r   r   rj  rv  r$   r%   r  r  r   r  r  r\   c                    ||n| j                   j                  }| j                          | j                  ||||||||	|
||||      }|d   }| j	                  |      }| j                  |      }| j                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r(   r   r  )rm   r  r\  r(  r  r~   rF  r
   r.   r  r   r   r  )rZ   r   r   r   r   r   rj  rv  r$   r%   r  r  r   r  r  r  r  r  r  r  r  s                        r9   r:   z(ModernBertForTokenClassification.forward  s"   6 &1%<k$++B]B]!**) 3%'!!!/!5#  
 $AJ II&78 II&78!23')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r;   r  )rC   rD   rE   r   rX   r   r  r   r  r   r  r   rG   r   rH   rI   r  r   r   r:   ri   rj   s   @r9   rD  rD    sw   

/ 
 ++FG&)$ 15156:/304)-*.-1$($(!%,0/3&*;
E,,-;
 !.;
 &ell3	;

 u||,;
  -;
 &;
 %,,';
 U\\*;
 SM;
 SM;
 #;
 $D>;
 'tn;
 d^;
  
uU\\"$99	:!;
 H;
r;   rD  z
    The ModernBert Model with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c            #           e Zd Zdef fdZ ee       eee	e
      	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   dee   dee   dee   deeej                     e	f   fd              Z xZS )rE  rm   c                 `   t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y re   r  r   s     r9   rX   z'ModernBertForQuestionAnswering.__init__e  sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJr;   r  r   r   r   r   start_positionsend_positionsrv  r$   r%   r  r  r   r  r  r\   c                 T   ||n| j                   j                  }| j                          | j                  |||||||	|
||||      }|d   }| j	                  |      }| j                  |      }| j                  |      }|j                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }|| | j                  ||||fi |}|s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )N)r   r   r   rv  r$   r%   r  r  r   r  r  r   r   r(   r   )r  start_logits
end_logitsr   r  )rm   r  r\  r(  r  r~   rF  splitr  r,   r  r   r   r  )rZ   r   r   r   r   r  r  rv  r$   r%   r  r  r   r  r  r  r  r  r  r  r  r  r  s                          r9   r:   z&ModernBertForQuestionAnswering.forwardp  sg   0 &1%<k$++B]B]!**) 3%!!!/!5#  
 $AJ II&78 II&78!23#)<<r<#: j#++B/::<''+668
&=+D%4%%lJQ^ibhiD"J/'!"+=F)-)9TGf$EvE+%!!//))
 	
r;   r  )rC   rD   rE   r   rX   r   r  r   r  r   r  r   rG   rH   rI   r  r   r   r:   ri   rj   s   @r9   rE  rE  ]  sr   	/ 	 ++FG&0$ 266:/32604*.-1$($(!%,0/3&*;
ELL);
 !.;
 &ell3	;

 u||,;
 "%,,/;
  -;
 %,,';
 U\\*;
 SM;
 SM;
 #;
 $D>;
 'tn;
 d^;
" 
uU\\"$@@	A#;
 H;
r;   rE  )r  r'  rA  rC  rD  rE  rB   )Nr   r  )Yr<  
contextlibr   typingr   r   r   r   rG   torch.nn.functionalr   r   r   torch.nnr	   r
   r   activationsr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   r   r   r   utils.import_utilsr   configuration_modernbertr   flash_attn.flash_attn_interfacer   flash_attn.layers.rotaryr    flash_attn.ops.triton.rotaryr!   object
get_loggerrC   rZ  r  r  autogradFunctionr#   rH   rI   rM   rO   r:  rl   r   r   r   r   r   r  r   r   rS   r   r   r  r   r  MODERNBERT_START_DOCSTRINGr'  r~  r  r  r  r?  rA  rC  rD  rE  __all__rJ   r;   r9   <module>r     s  ,  " / /     A A ! B  L -  6 6 P89O			H	%3 $46%..11 46v *. $L &	L
 L42Q 2Qj299 <:BII :(<		 <B(H )."!"	" LL" 	"
 5++," 38_" 	" 
"  ~" 5u||+,eELL.AAB"\ !&(!!(!	(! 2(! 	(!
 (! 38_(! 	(! 
(! ++(! 5<<(!V ! 	  LL  	 
 5++,  38_  	  
  5<< H 1$"! M3")) M3`+3RYY +3\ " ^B B	BP ,0%)	&mLL&mLL&m 5<<(&m U\\"	&m
 5<<u||S(5<<:PRZ[`[g[gRhhi&mRLL\\  	
 \\>: z ^k:/ k:	k:\	>ryy 	> `}
5 }
	}
@ \k
*C k
	k
\ rN
'@ N
	N
b  M
%> M
M
`r;   