
    %	&h                        d Z ddlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)  e$jT                  e+      Z,dZ-dZ.g dZ/e G d de#             Z0e G d de#             Z1 G d dejd                        Z3 G d dejd                        Z4 G d dejd                        Z5	 dWdejd                  dejl                  d ejl                  d!ejl                  d"eejl                     d#e7d$e7fd%Z8 G d& d'ejd                        Z9 G d( d)ejd                        Z: G d* d+ejd                        Z; G d, d-ejd                        Z< G d. d/ejd                        Z= G d0 d1ejd                        Z> G d2 d3ejd                        Z? G d4 d5ejd                        Z@d6 ZA G d7 d8ejd                        ZB G d9 d:ejd                        ZC G d; d<ejd                        ZD G d= d>ejd                        ZE G d? d@e      ZFdAZGdBZH edCeG       G dD dEeF             ZI G dF dGejd                        ZJ G dH dIejd                        ZK G dJ dKejd                        ZL edLeG       G dM dNeF             ZM G dO dPejd                        ZN G dQ dRejd                        ZO edSeG       G dT dUeF             ZPg dVZQy)XzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)	dataclass)CallableListOptionalSetTupleUnion)nn)CrossEntropyLoss   )ACT2FN)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputlogging	torch_int)load_backbone   )	DPTConfigr   zIntel/dpt-large)r   iA  i   c                   l    e Zd ZU dZdZeej                     ed<   dZ	ee
ej                  df      ed<   y)*BaseModelOutputWithIntermediateActivationsa#  
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)__name__
__module____qualname____doc__r!   r   torchFloatTensor__annotations__r"   r        z/var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.pyr    r    8   s?    	 7;!2!23:HLhuU->->-C'DELr+   r    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
            the classification token after processing through a linear layer and a tanh activation function. The linear
            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr"   )r#   r$   r%   r&   r/   r   r'   r(   r)   r0   r1   r   r2   r"   r*   r+   r,   r.   r.   I   s    6 6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>HLhuU->->-C'DELr+   r.   c            	       p     e Zd ZdZd	 fd	Zd
dZ	 ddej                  dededej                  fdZ	 xZ
S )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 b   t         
|           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }t        |      | _        | j                  j                  d   }t        | j                  j                        dk7  r+t        dt        | j                  j                               ddg| _        ||j                   }	|	dd  }|	d   }nCt        |t        j                  j                        r|n||f}| j                  j                  d   }|| _        |d   | _        || _        t#        j$                  ||d      | _        t#        j(                  t+        j,                  dd|j
                              | _        t#        j(                  t+        j,                  d|dz   |j
                              | _        y )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper
   Conv2d
projection	Parameterr'   zeros	cls_tokenposition_embeddings)selfconfigfeature_sizer<   r=   r>   r?   num_patchesfeature_dimfeat_map_shape	__class__s             r,   r;   zDPTViTHybridEmbeddings.__init__t   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q RYegsXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r+   c                 r   |d d d |f   }|d|d f   }t        t        |      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S 
Nr         ?r   r6   r      bilinear)sizemodedim)	r   rF   reshapepermuter
   
functionalinterpolater'   catrP   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r,   _resize_pos_embedz(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r+   pixel_valuesinterpolate_pos_encodingreturn_dictreturnc                    |j                   \  }}}}|| j                  k7  rt        d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  | j
                  || j                  z  || j                  z        }| j                  |      }	|	j                  d   }
| j                  D cg c]  }|	j                  |    }}| j                  |
      j                  d	      j                  dd	      }| j                  j                  |dd      }t        j                   ||fd
      }||z   }|s||fS t#        ||      S c c}w )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r6   rZ   r^   )r!   r"   )shaper>   rG   r<   rm   rO   r=   rD   feature_mapsrH   rK   flatten	transposerN   expandr'   rd   r    )rP   rn   ro   rp   
batch_sizer>   heightwidthrO   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                  r,   forwardzDPTViTHybridEmbeddings.forward   s    3?2D2D/
L&%4,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?$//AY
 --5"//3 RVQpQpq < <U Cqq__X.66q9CCAqI
^^**:r2>
YY
J7Q?
  "55
 455 :)%9
 	
  rs   )E?Nr   )FF)r#   r$   r%   r&   r;   rm   r'   Tensorboolr   __classcell__rV   s   @r,   r4   r4   m   sH     eD gl)
!LL)
DH)
_c)
	)
r+   r4   c                   2     e Zd ZdZ fdZddZddZ xZS )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        | j                  j                  }t        j                  t	        j
                  d|dz   |j                              | _        t        j                  |j                        | _        || _        y )Nr   )r:   r;   r
   rL   r'   rM   r?   rN   DPTViTPatchEmbeddingspatch_embeddingsrS   rO   Dropouthidden_dropout_probdropoutrQ   )rP   rQ   rS   rV   s      r,   r;   zDPTViTEmbeddings.__init__   s    ekk!Q8J8J&KL 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<=r+   c                 ~   |d d d |f   }|d|d f   }t        |j                  d      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S rX   )	r   r\   r`   ra   r
   rb   rc   r'   rd   re   s           r,   rm   z"DPTViTEmbeddings._resize_pos_embed   s    A||O,
Q_-!+"2"21"5"<=!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r+   c                    |j                   \  }}}}| j                  j                  }| j                  | j                  ||z  ||z        }| j                  |      }	|	j                         \  }}
}| j                  j                  |dd      }t        j                  ||	fd      }	|	|z   }	| j                  |	      }	|s|	fS t        |	      S )Nr6   r   r^   )r!   )ru   rQ   r=   rm   rO   r   r\   rN   ry   r'   rd   r   r    )rP   rn   rp   rz   r>   r{   r|   r=   rO   r   seq_len_r   s                r,   r   zDPTViTEmbeddings.forward   s    2>2D2D/
L&% [[++
"44$$f
&:EZ<O
 **<8
!+!2
GQ ^^**:r2>
YY
J7Q?
  "55
\\*-
= 9ZXXr+   r   )F)r#   r$   r%   r&   r;   rm   r   r   r   s   @r,   r   r      s    
Yr+   r   c                   (     e Zd ZdZ fdZd Z xZS )r   z$
    Image to Patch Embedding.

    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )r9   stride)r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rS   r
   rJ   rK   )rP   rQ   r<   r=   r>   r?   rS   rV   s          r,   r;   zDPTViTPatchEmbeddings.__init__  s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir+   c                     |j                   \  }}}}|| j                  k7  rt        d      | j                  |      j	                  d      j                  dd      }|S )Nrs   rZ   r   )ru   r>   rG   rK   rw   rx   )rP   rn   rz   r>   r{   r|   r   s          r,   r   zDPTViTPatchEmbeddings.forward  sb    2>2D2D/
L&%4,,,w  __\2::1=GG1M
r+   r#   r$   r%   r&   r;   r   r   r   s   @r,   r   r   
  s    
jr+   r   modulequerykeyvalueattention_maskscalingr   c                    t        j                  ||j                  dd            |z  }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }|||z  }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr6   r7   )r_   dtype)ptrainingr   rZ   )r'   matmulrx   r
   rb   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r,   eager_attention_forwardr   *  s     <<s}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#n4,,|U3K''1-88:K$$r+   c            
            e Zd Zdeddf fdZdej                  dej                  fdZ	 d
deej                     de	de
eej                  ej                  f   eej                     f   fd	Z xZS )DPTSelfAttentionrQ   rq   Nc                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r:   r;   r?   num_attention_headshasattrrG   rQ   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr
   Linearqkv_biasr   r   r   rP   rQ   rV   s     r,   r;   zDPTSelfAttention.__init__J  sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r+   xc                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr6   r   rZ   r   r   )r\   r   r   viewra   )rP   r   new_x_shapes      r,   transpose_for_scoresz%DPTSelfAttention.transpose_for_scores^  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r+   	head_maskoutput_attentionsc           
         | j                  | j                  |            }| j                  | j                  |            }| j                  | j                  |            }t        }| j
                  j                  dk7  rN| j
                  j                  dk(  r|rt        j                  d       nt        | j
                  j                     } || ||||| j                  | j                  | j                  sdn| j                        \  }}	|j                         d d | j                  fz   }
|j!                  |
      }|r||	f}|S |f}|S )Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   r7   )r   r   r   r   r   rQ   _attn_implementationloggerwarning_oncer   r   r   r   r   r\   r   r`   )rP   r1   r   r   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss               r,   r   zDPTSelfAttention.forwardc  s=    --dhh}.EF	//

=0IJ//

=0IJ(?;;++w6{{//69>O##L
 '>dkk>^>^&_#)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EF6G=/2 O\M]r+   NF)r#   r$   r%   r   r;   r'   r   r   r   r   r	   r   r   r   r   s   @r,   r   r   I  s    ]y ]T ](%ell %u|| % bg!(0(>!Z^!	uU\\5<</0%2EE	F!r+   r   c                   |     e Zd ZdZdeddf fdZdej                  dej                  dej                  fdZ xZ	S )	DPTViTSelfOutputz
    The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rQ   rq   Nc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r   )	r:   r;   r
   r   r?   denser   r   r   r   s     r,   r;   zDPTViTSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r+   r1   input_tensorc                 J    | j                  |      }| j                  |      }|S r   r   r   rP   r1   r   s      r,   r   zDPTViTSelfOutput.forward  s$    

=1]3r+   )
r#   r$   r%   r&   r   r;   r'   r   r   r   r   s   @r,   r   r     sD    
>y >T >
U\\  RWR^R^ r+   r   c                        e Zd Zdeddf fdZdee   ddfdZ	 	 ddej                  de
ej                     d	edeeej                  ej                  f   eej                     f   fd
Z xZS )DPTViTAttentionrQ   rq   Nc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )r:   r;   r   	attentionr   outputsetpruned_headsr   s     r,   r;   zDPTViTAttention.__init__  s0    )&1&v.Er+   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r^   )rF   r   r   r   r   r   r   r   r   r   r   r   r   union)rP   r   r   s      r,   prune_headszDPTViTAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r+   r1   r   r   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r   )rP   r1   r   r   self_outputsattention_outputr   s          r,   r   zDPTViTAttention.forward  sE     ~~mY@QR;;|AF#%QR(88r+   r   )r#   r$   r%   r   r;   r   r   r   r'   r   r   r   r	   r   r   r   r   s   @r,   r   r     s    "y "T ";S ;d ;, -1"'	|| ELL)  	
 
uU\\5<</0%2EE	Fr+   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )DPTViTIntermediaterQ   rq   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r:   r;   r
   r   r?   intermediate_sizer   r@   
hidden_actstrr   intermediate_act_fnr   s     r,   r;   zDPTViTIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r+   r1   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rP   r1   s     r,   r   zDPTViTIntermediate.forward  s&    

=100?r+   	r#   r$   r%   r   r;   r'   r   r   r   r   s   @r,   r   r     s1    9y 9T 9U\\ ell r+   r   c                   x     e Zd Zdeddf fdZdej                  dej                  dej                  fdZ xZS )DPTViTOutputrQ   rq   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
r:   r;   r
   r   r   r?   r   r   r   r   r   s     r,   r;   zDPTViTOutput.__init__  sB    YYv779K9KL
zz&"<"<=r+   r1   r   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r,   r   zDPTViTOutput.forward  s.    

=1]3%4r+   r   r   s   @r,   r   r     s?    >y >T >
U\\  RWR^R^ r+   r   c                        e Zd ZdZdeddf fdZ	 	 d
dej                  deej                     de	de
eej                  ej                  f   eej                     f   fd	Z xZS )DPTViTLayerz?This corresponds to the Block class in the timm implementation.rQ   rq   Nc                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r:   r;   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r
   	LayerNormr?   layer_norm_epslayernorm_beforelayernorm_afterr   s     r,   r;   zDPTViTLayer.__init__  s    '-'E'E$(0.v6"6* "V-?-?VEZEZ [!||F,>,>FDYDYZr+   r1   r   r   c                     | j                  | j                  |      ||      }|d   }|dd  }||z   }| j                  |      }| j                  |      }| j	                  ||      }|f|z   }|S )N)r   r   r   )r   r  r  r   r   )rP   r1   r   r   self_attention_outputsr   r   layer_outputs           r,   r   zDPTViTLayer.forward  s     "&!!-0/ "0 "

 2!4(, )=8 ++M:((6 {{<?/G+r+   r   )r#   r$   r%   r&   r   r;   r'   r   r   r   r	   r   r   r   r   s   @r,   r   r     s    I[y [T [ -1"'	|| ELL)  	
 
uU\\5<</0%2EE	Fr+   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej                  deej                     deded	ede	e
ef   fd
Z xZS )DPTViTEncoderrQ   rq   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r:   r;   rQ   r
   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingrP   rQ   r   rV   s      r,   r;   zDPTViTEncoder.__init__  sN    ]]vG_G_A`#aAK$7#ab
&+# $bs   A#r1   r   r   r   rp   c                 t   |rdnd }|rdnd }t        | j                        D ]h  \  }}	|r||fz   }|||   nd }
| j                  r+| j                  r| j	                  |	j
                  ||
|      }n
 |	||
|      }|d   }|s`||d   fz   }j |r||fz   }|st        d |||fD              S t        |||      S )Nr*   r   r   c              3   &   K   | ]	  }||  y wr   r*   ).0vs     r,   	<genexpr>z(DPTViTEncoder.forward.<locals>.<genexpr>;  s     mq_`_lms   )r/   r1   r2   )	enumerater  r  r   _gradient_checkpointing_func__call__tupler   )rP   r1   r   r   r   rp   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r,   r   zDPTViTEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!.7.CilO**t}} $ A A ))!#%	! !-]OM^ _)!,M &9]1=M<O&O#'	P*   1]4D Dm]4EGZ$[mmm++*
 	
r+   )NFFT)r#   r$   r%   r   r;   r'   r   r   r   r	   r  r   r   r   r   s   @r,   r	  r	    sz    ,y ,T , -1"'%* )
||)
 ELL))
  	)

 #)
 )
 
uo%	&)
r+   r	  c                   t     e Zd ZdZ fdZd Zd Zddeej                     deej                     fdZ
 xZS )	DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                     t         |           || _        t        j                         | _        |j                  r| j                  |       n| j                  |       |j                  | _	        y r   )
r:   r;   rQ   r
   r  layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r,   r;   zDPTReassembleStage.__init__S  sU    mmo,,V4%%f-"(";";r+   c           	      v   t        t        t        |j                              |j                        D ]r  \  }}|dk  r.| j
                  j                  t        j                                9|dkD  s?| j
                  j                  t        ||j                  |   |             t |j                  dk7  rt        d|j                   d      t        j                         | _        t        |      }t        t        |j                              D ]  }|dk  rA| j                  j                  t        j                  t        j                                      I|dkD  sO| j                  j                  t        j                  t        j                   d|z  |      t"        |j$                                   y)a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   rE   factorprojectzReadout type z! is not supported for DPT-Hybrid.rZ   N)zipr  rF   neck_hidden_sizesreassemble_factorsr#  appendr
   IdentityDPTReassembleLayerreadout_typerG   r  readout_projects_get_backbone_hidden_size
Sequentialr   r   r   )rP   rQ   r  r*  r?   s        r,   r%  z.DPTReassembleStage._init_reassemble_dpt_hybrid_  sX    U3v'?'?#@A6C\C\] 	tIAvAv""2;;=1Q""#5fvG_G_`aGbkq#rs		t )+}V-@-@,AAbcdd !#/7s63345 	AAv%%,,R]]2;;=-IJQ%%,,MM"))AO["I6RXRcRcKde		r+   c           	      <   t        t        t        |j                              |j                        D ]9  \  }}| j
                  j                  t        ||j                  |   |             ; |j                  dk(  rt        j                         | _        t        |      }t        t        |j                              D ]Y  }| j                  j                  t        j                  t        j                  d|z  |      t        |j                                   [ y y )Nr)  r+  rZ   )r,  r  rF   r-  r.  r#  r/  r1  r2  r
   r  r3  r4  r5  r   r   r   )rP   rQ   r  r*  r?   r   s         r,   r&  z'DPTReassembleStage._init_reassemble_dpty  s    U3v'?'?#@A6C\C\] 	pIAvKK1&6C[C[\]C^gmno	p )+$&MMOD!3F;K3v7789 %%,,MM"))AO["I6RXRcRcKde ,r+   r1   rq   c                    g }t        |      D ]  \  }}|| j                  vr|dddf   |ddddf   }}|j                  \  }}	}
|||j                  ||||
      }n"t	        |	dz        }|j                  ||||
      }|j                  dddd      j                         }|j                  }| j                  j                  dk(  r|j                  d      j                  d      }|j                  d      j                  |      } | j                  |   t        j                  ||fd	            }|j                  ddd      j                  |      }nM| j                  j                  d
k(  r4|j                  d      |j                  d	      z   }|j                  |      } | j                  |   |      }|j!                  |        |S )z
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rY   r   rZ   r+  )r   rZ   r   r6   add)r  r'  ru   r`   r   ra   r   rQ   r2  rw   	unsqueeze	expand_asr3  r'   rd   r#  r/  )rP   r1   patch_heightpatch_widthoutr  hidden_staterN   rz   sequence_lengthr>   r\   feature_shapereadouts                 r,   r   zDPTReassembleStage.forward  s    (7 	%OA|///*6q!t*<l1ab5>Q<	<H<N<N9
O\+0G#/#7#7
LR]_k#lL$_c%9:D#/#7#7
D$P\#]L+33Aq!Q?JJL , 2 2;;++y8#/#7#7#:#B#B9#ML'11!4>>|LG#;4#8#8#;EII|U\F]_a<b#cL#/#7#71a#@#H#H#WL[[--6#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL-t{{1~l;JJ|$3	%6 
r+   NN)r#   r$   r%   r&   r;   r%  r&  r   r'   r   r   r   r   s   @r,   r!  r!  C  s@    
<4
#T%,,%7 #aefkfrfras #r+   r!  c                 z    | j                   $| j                  du r| j                   j                  S | j                  S r   )backbone_configr$  r?   )rQ   s    r,   r4  r4    s9    )f.>.>%.G%%111!!!r+   c                   $     e Zd Z fdZd Z xZS )r1  c           	      \   t         |           t        |      }t        j                  ||d      | _        |dkD  r t        j                  ||||d      | _        y |dk(  rt        j                         | _        y |dk  r,t        j                  ||dt        d|z        d      | _        y y )Nr   )in_channelsout_channelsr9   r   r9   r   paddingr   )
r:   r;   r4  r
   rJ   rK   ConvTranspose2dresizer0  r   )rP   rQ   rE   r*  r?   rV   s        r,   r;   zDPTReassembleLayer.__init__  s    /7))(`ab A:,,XxV\blmnDKq[++-DKaZ))HhAcRSV\R\oghiDK r+   c                 J    | j                  |      }| j                  |      }|S r   )rK   rL  )rP   r>  s     r,   r   zDPTReassembleLayer.forward  s$    |4{{<0r+   r#   r$   r%   r;   r   r   r   s   @r,   r1  r1    s    jr+   r1  c                   $     e Zd Z fdZd Z xZS )DPTFeatureFusionStagec                     t         |           t        j                         | _        t        t        |j                              D ]&  }| j                  j                  t        |             ( y r   )
r:   r;   r
   r  r#  r  rF   r-  r/  DPTFeatureFusionLayerr  s      r,   r;   zDPTFeatureFusionStage.__init__  sR    mmos63345 	>AKK4V<=	>r+   c                     |d d d   }g }d }t        || j                        D ]*  \  }}|	 ||      }n	 |||      }|j                  |       , |S )Nr6   )r,  r#  r/  )rP   r1   fused_hidden_statesfused_hidden_stater>  r  s         r,   r   zDPTFeatureFusionStage.forward  sq    %dd+ !#&}dkk#B 	;L%!)%*<%8"%*+=|%L"&&'9:	; #"r+   rN  r   s   @r,   rP  rP    s    >#r+   rP  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                 l   t         |           |j                  | _        |j                  |j                  n| j                   }t        j                         | _        t        j                  |j                  |j                  ddd|      | _
        t        j                         | _        t        j                  |j                  |j                  ddd|      | _        | j                  rIt        j                  |j                        | _        t        j                  |j                        | _        y y )Nr   r   )r9   r   rJ  r   )r:   r;   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr
   ReLUactivation1rJ   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rP   rQ   r[  rV   s      r,   r;   zDPTPreActResidualLayer.__init__  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r+   r>  rq   c                    |}| j                  |      }| j                  |      }| j                  r| j                  |      }| j	                  |      }| j                  |      }| j                  r| j                  |      }||z   S r   )r]  r_  rZ  rc  r`  ra  rd  rP   r>  residuals      r,   r   zDPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9Lh&&r+   )	r#   r$   r%   r&   r;   r'   r   r   r   r   s   @r,   rW  rW    s*     ID'ELL 'U\\ 'r+   rW  c                   ,     e Zd ZdZd fd	ZddZ xZS )rR  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    c                     t         |           || _        t        j                  |j
                  |j
                  dd      | _        t        |      | _        t        |      | _	        y )Nr   T)r9   r   )
r:   r;   align_cornersr
   rJ   r^  rK   rW  residual_layer1residual_layer2)rP   rQ   rj  rV   s      r,   r;   zDPTFeatureFusionLayer.__init__'  sT    *))F$=$=v?X?Xfgnrs5f=5f=r+   c                    |l|j                   |j                   k7  r?t        j                  j                  ||j                   d   |j                   d   fdd      }|| j	                  |      z   }| j                  |      }t        j                  j                  |dd| j                        }| j                  |      }|S )NrZ   r   r[   Fr\   r]   rj  scale_factorr]   rj  )ru   r
   rb   rc   rk  rl  rj  rK   rf  s      r,   r   zDPTFeatureFusionLayer.forward1  s    !!X^^3==44L$6$6q$9<;M;Ma;P#QXbrw 5  ($*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r+   Tr   r   r   s   @r,   rR  rR    s    >r+   rR  c                   .    e Zd ZdZeZdZdZdZdZ	dZ
d Zy)DPTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    dptrn   Tc                    t        |t        j                  t        j                  t        j                  f      rl|j
                  j                  j                  d| j                  j                         |j                  |j                  j                  j                          nct        |t        j                        rI|j                  j                  j                          |j
                  j                  j                  d       t        |t        t        f      rI|j                   j                  j                          |j"                  j                  j                          yy)zInitialize the weightsr   )meanstdNg      ?)r@   r
   r   rJ   rK  weightdatanormal_rQ   initializer_ranger   zero_r  fill_r   r4   rN   rO   )rP   r   s     r,   _init_weightsz DPTPreTrainedModel._init_weightsO  s    fryy"))R5G5GHI MM&&CT[[5R5R&S{{&  &&(-KK""$MM$$S)f/1GHI!!'')&&++113 Jr+   N)r#   r$   r%   r&   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_2r~  r*   r+   r,   rs  rs  B  s/    
 L$O&*#N!4r+   rs  aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aP  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
z]The bare DPT Model transformer outputting raw hidden-states without any specific head on top.c                        e Zd Zd fd	Zd Zd Z ee       ee	e
ede      	 	 	 	 ddej                  deej                     dee   d	ee   d
ee   deee
f   fd              Z xZS )DPTModelc                 T   t         |   |       || _        |j                  rt	        |      | _        nt        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd | _        | j!                          y )Nr   )r:   r;   rQ   r$  r4   r   r   r	  encoderr
   r  r?   r  	layernormDPTViTPoolerpooler	post_init)rP   rQ   add_pooling_layerrV   s      r,   r;   zDPTModel.__init__  s      4V<DO.v6DO$V,f&8&8f>S>ST.?l6*T 	r+   c                 r    | j                   j                  r| j                  S | j                  j                  S r   )rQ   r$  r   r   )rP   s    r,   get_input_embeddingszDPTModel.get_input_embeddings  s)    ;;  ??"??333r+   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r   r   )rP   heads_to_pruner  r   s       r,   _prune_headszDPTModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr+   vision)
checkpointoutput_typer  modalityexpected_outputrn   r   r   r   rp   rq   c                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  || j                   j
                        }| j                  ||      }|s|d   n|j                  }| j                  |||||      }|d   }	| j                  |	      }	| j                  | j                  |	      nd }
|s|
|	|
fn|	f}||dd  z   |dd  z   S t        |	|
|j                  |j                  |j                        S )N)rp   r   r   r   r   rp   r   )r/   r0   r1   r2   r"   )rQ   r   r   use_return_dictget_head_maskr  r   r!   r  r  r  r.   r1   r2   r"   )rP   rn   r   r   r   rp   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputhead_outputss               r,   r   zDPTModel.forward  sU     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] &&y$++2O2OP	??<[?QBM'7':ScSvSv$,,(/!5# ' 
 *!,..98<8OO4UY?L?XO];_n^pL/!""558H8LLLC-')77&11%5%N%N
 	
r+   rq  )NNNN)r#   r$   r%   r;   r  r  r   DPT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr.   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr'   r(   r   r   r	   r   r   r   r   s   @r,   r  r    s    
"4C ++?@&H$. 26,0/3&*/
''/
 E--./
 $D>	/

 'tn/
 d^/
 
uJJ	K/
 A/
r+   r  c                   *     e Zd Zdef fdZd Z xZS )r  rQ   c                     t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        y r   )
r:   r;   r
   r   r?   pooler_output_sizer   r   
pooler_act
activationr   s     r,   r;   zDPTViTPooler.__init__  s>    YYv1163L3LM
 !2!23r+   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rP   r1   first_token_tensorr  s       r,   r   zDPTViTPooler.forward  s6     +1a40

#566r+   )r#   r$   r%   r   r;   r   r   r   s   @r,   r  r    s    4y 4
r+   r  c                   h     e Zd ZdZ fdZddeej                     deej                     fdZ xZ	S )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    c           
         t         |           || _        |j                   |j                  j                  dv rd | _        nt        |      | _        t        j                         | _	        |j                  D ]?  }| j                  j                  t        j                  ||j                  ddd             A t        |      | _        y )N)swinv2r   r   Fr9   rJ  r   )r:   r;   rQ   rD  
model_typereassemble_stager!  r
   r  convsr-  r/  rJ   r^  rP  fusion_stage)rP   rQ   channelrV   s      r,   r;   zDPTNeck.__init__  s     !!-&2H2H2S2SWa2a$(D!$6v$>D!]]_
// 	sGJJbii1J1JXYcdkpqr	s 2&9r+   r1   rq   c                    t        |t        t        f      st        d      t	        |      t	        | j
                  j                        k7  rt        d      | j                  | j                  |||      }t        |      D cg c]  \  }} | j                  |   |       }}}| j                  |      }|S c c}}w )z
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.)r@   r  list	TypeErrorrF   rQ   r-  rG   r  r  r  r  )rP   r1   r;  r<  r  featurer~   r   s           r,   r   zDPTNeck.forward  s     -%7PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UVzq'MDJJqM'*VV ""8, Ws   B:rB  
r#   r$   r%   r&   r;   r   r'   r   r   r   r   s   @r,   r  r    s6    	:"T%,,%7 aefkfrfras r+   r  c                   `     e Zd ZdZ fdZdeej                     dej                  fdZ xZ	S )DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    c                    t         |           || _        d | _        |j                  rt        j                  ddddd      | _        |j                  }t        j                  t        j                  ||dz  ddd      t        j                  ddd	
      t        j                  |dz  dddd      t        j                         t        j                  ddddd      t        j                               | _        y )N   )r   r   )r   r   rI  rZ   r   r   r[   Tro      r   )r:   r;   rQ   rK   add_projectionr
   rJ   r^  r5  Upsampler\  headrP   rQ   r~   rV   s      r,   r;   zDPTDepthEstimationHead.__init__+  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r+   r1   rq   c                     || j                   j                     }| j                  +| j                  |      } t        j                         |      }| j                  |      }|j                  d      }|S )Nr   r^   )rQ   head_in_indexrK   r
   r\  r  squeeze)rP   r1   predicted_depths      r,   r   zDPTDepthEstimationHead.forward>  sg    %dkk&?&?@??& OOM:M%BGGIm4M))M2)11a18r+   r  r   s   @r,   r  r  $  s-    
&T%,,%7 ELL r+   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                        e Zd Z fdZ ee       eee      	 	 	 	 	 dde	j                  dee	j                     dee	j                     dee   dee   dee   d	eee	j                      ef   fd
              Z xZS )DPTForDepthEstimationc                 $   t         |   |       d | _        |j                  du r)|j                  |j                  t        |      | _        nt        |d      | _        t        |      | _	        t        |      | _        | j                          y NF)r  )r:   r;   rD   r$  rD  r   r  rt  r  neckr  r  r  r   s     r,   r;   zDPTForDepthEstimation.__init__T  s}     u$&*@*@*LPVP_P_Pk)&1DM%@DH FO	 +62	 	r+   r  r  rn   r   labelsr   r   rp   rq   c                 f    d}|t        d      ||n j                  j                  }||n j                  j                  }||n j                  j                  } j
                  + j
                  j                  |||      }|j                  }	n j                  |||d|      }|r|j                  n|d   }	 j                  j                  s:t        |	dd       D 
cg c]   \  }
}|
 j                  j                  v s|" }	}
}nD|r|j                  nt        |d         }|j                   fdt        |	dd       D               |}	d	\  }} j                  j                   S j                  j                  d
u r;|j"                  \  }}}} j                  j                   j$                  }||z  }||z  } j'                  |	||      }	 j)                  |	      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t+        |||r|j                  nd|j,                        S c c}}
w )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yet)r   r   Tr  r   r6   c              3   ^   K   | ]$  \  }}|j                   j                  d d v r| & ywrZ   NrQ   backbone_out_indicesr  idxr  rP   s      r,   r  z0DPTForDepthEstimation.forward.<locals>.<genexpr>  s6      .$Wdkk>>qrBB .s   *-rB  FrZ   )lossr  r1   r2   )NotImplementedErrorrQ   r  r   r   rD   forward_with_filtered_kwargsrv   rt  r1   r$  r  r  r"   r  extendrD  ru   r=   r  r  r   r2   )rP   rn   r   r  r   r   rp   r  r   r1   r  r  backbone_hidden_statesr;  r<  r   r{   r|   r=   r  r   s   `                    r,   r   zDPTForDepthEstimation.forwardf  st   b %&GHH%0%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq==$mm@@3G[l A G $00Mhh#"3%)'  G 6AG11gajM ;;((09-:K0L! ,WPSW[WbWbWwWwPwG! ! NY)I)I^bcjkmcn^o&&-- .(1-2C(D.  !7$.!k;;&&2t{{7L7LPU7U"."4"4Aq&%44??J!Z/L:-K		-{K))M2#)+gabk9)+gabk9)-)9TGf$EvE#+3G'//T))	
 	
?!s   & H-H-)NNNNN)r#   r$   r%   r;   r   r  r   r   r  r'   r(   r   
LongTensorr   r	   r   r   r   r   r   s   @r,   r  r  M  s    $ ++?@+?o^ 26-1,0/3&*n
''n
 E--.n
 ))*	n

 $D>n
 'tnn
 d^n
 
uU\\"$88	9n
 _ An
r+   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )DPTSemanticSegmentationHeadc                    t         |           || _        |j                  }t	        j
                  t	        j                  ||ddd      t	        j                  |      t	        j                         t	        j                  |j                        t	        j                  ||j                  d      t	        j                  ddd	            | _        y )
Nr   r   Fr  r8   rZ   r[   Tro  )r:   r;   rQ   r^  r
   r5  rJ   rb  r\  r   semantic_classifier_dropout
num_labelsr  r  r  s      r,   r;   z$DPTSemanticSegmentationHead.__init__  s    ,,MMIIhaONN8$GGIJJv99:IIh 1 1qAKKQZtL
	r+   r1   rq   c                 Z    || j                   j                     }| j                  |      }|S r   )rQ   r  r  rP   r1   logitss      r,   r   z#DPTSemanticSegmentationHead.forward  s)    %dkk&?&?@=)r+   )	r#   r$   r%   r;   r   r'   r   r   r   r   s   @r,   r  r    s(    
T%,,%7 ELL r+   r  c                   $     e Zd Z fdZd Z xZS )DPTAuxiliaryHeadc                 X   t         |           |j                  }t        j                  t        j
                  ||ddd      t        j                  |      t        j                         t        j                  dd      t        j
                  ||j                  d            | _
        y )Nr   r   Fr  g?r8   )r:   r;   r^  r
   r5  rJ   rb  r\  r   r  r  r  s      r,   r;   zDPTAuxiliaryHead.__init__  sv    ,,MMIIhaONN8$GGIJJsE"IIh 1 1qA
	r+   c                 (    | j                  |      }|S r   )r  r  s      r,   r   zDPTAuxiliaryHead.forward  s    =)r+   rN  r   s   @r,   r  r    s    

r+   r  zY
    DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    c                       e Zd Z fdZ ee       eee      	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e   de	e   de	e   d	eee
j                      ef   fd
              Z xZS )DPTForSemanticSegmentationc                     t         |   |       t        |d      | _        t	        |      | _        t        |      | _        |j                  rt        |      nd | _
        | j                          y r  )r:   r;   r  rt  r  r  r  r  use_auxiliary_headr  auxiliary_headr  r   s     r,   r;   z#DPTForSemanticSegmentation.__init__  s^     Fe< FO	 07	:@:S:S.v6Y] 	r+   r  rn   r   r  r   r   rp   rq   c                     ||n j                   j                  }||n j                   j                  }|$ j                   j                  dk(  rt	        d       j                  |||d|      }|r|j                  n|d   } j                   j                  s:t        |dd       D 	
cg c]   \  }	}
|	 j                   j                  v s|
" }}	}
nD|r|j                  nt        |d         }|j                   fdt        |dd       D               |} j                  |      } j                  |      }d} j                   j                  |d         }d}|t         j"                  j%                  ||j&                  d	d d
d      }|0t         j"                  j%                  ||j&                  d	d d
d      }t)         j                   j*                        } |||      } ||      }| j                   j,                  |z  z   }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t/        |||r|j                  nd|j0                        S c c}
}	w )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r6   c              3   `   K   | ]%  \  }}|j                   j                  d d v s"| ' ywr  r  r  s      r,   r  z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>X  s7      *(CCSWS^S^SsSstutvSwLw*s   #..)r1   r7   r[   Frn  )ignore_indexrZ   )r  r  r1   r2   )rQ   r  r   r  rG   rt  r1   r$  r  r  r"   r  r  r  r  r  r
   rb   rc   ru   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r2   )rP   rn   r   r  r   r   rp   r   r1   r  r  r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr   s   `                    r,   r   z"DPTForSemanticSegmentation.forward  s   F &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO((/!%#  
 2=--'!* {{$$,5mAB6G,H(CCSWS^S^SsSsLsM  JUW%E%EZ^_fgi_jZk"")) *,5mAB6G,H*  3M			>=)*#22=3DE!}}88V\\"#.Zu  9    +-/]]-F-F$6<<+<:]b .G .* (T[[5[5[\H !16:I%&@&INt{{@@>QQD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
Us   , II)NNNNNN)r#   r$   r%   r;   r   r  r   r   r  r   r'   r(   r  r   r	   r   r   r   r   r   s   @r,   r  r    s     ++?@+BQ`a 5915-1,0/3&*e
u001e
 E--.e
 ))*	e

 $D>e
 'tne
 d^e
 
uU\\"$;;	<e
 b Ae
r+   r  )r  r  r  rs  )r   )Rr&   collections.abcrA   dataclassesr   typingr   r   r   r   r   r	   r'   torch.utils.checkpointr
   torch.nnr   activationsr   
file_utilsr   r   r   r   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   utils.backbone_utilsr   configuration_dptr   
get_loggerr#   r   r  r  r  r    r.   Moduler4   r   r   r   floatr   r   r   r   r   r   r   r	  r!  r4  r1  rP  rW  rR  rs  DPT_START_DOCSTRINGr  r  r  r  r  r  r  r  r  __all__r*   r+   r,   <module>r     s    ! > >    % !  _ ^ F Q 4 4 1 ( 
		H	%  ( '  M M M   M;  M  MF`
RYY `
F7Yryy 7YtBII N %II%<<% 
% <<	%
 U\\*% % %>;ryy ;~ryy $&bii &T "299  '")) 'V0
BII 0
fe eP" ,#BII #0:'RYY :'z"BII "J4 4:	  . cW
! W
	W
v299 2bii 2j&RYY &R  	C
. C
C
L")) 2ryy &  	w
!3 w
w
t dr+   