
    %	&h3                     6   d dl mZmZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZmZ dd	lmZ d
dlmZ  ej&                  e      Z G d de      Z G d dej.                        Z G d dej.                        Z G d de      Z G d de      ZddgZy)    )ListOptionalTupleUnionN)nn   )ACT2FN)is_torchdynamo_compilinglogging   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration)MistralRMSNorm   )Mistral3Configc                       e Zd Zy)Mistral3RMSNormN__name__
__module____qualname__     /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r          r   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                 "   t         |           || _        |j                  j                  }|j
                  | _        | j                  j                  j                  | _        t        j                  || j
                  dz  z  |d      | _	        y )Nr   Fbias)
super__init__r   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr   r%   	__class__s      r   r#   zMistral3PatchMerger.__init__(   sr    **66"(";";++33>>YY{T5L5La5O'OQ\chir   image_featuresimage_sizesreturnc                    |D cg c]&  }|d   | j                   z  |d   | j                   z  f( }}|D cg c]
  \  }}||z   }}}|j                  d   }g }t        |j                  |            D ]  \  }	}
||	   \  }}|
j	                  |||      j                  ddd      j                  d      }t        j                  j                  j                  || j                  | j                        }|j	                  || j                  dz  z  d      j                         }|j                  |        t        j                  |d      }| j                  |      }|S c c}w c c}}w )Nr   r   r   )kernel_sizestridedim)r'   shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr&   tappendcatr)   )r*   r,   r-   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r   forwardzMistral3PatchMerger.forward1   sl   cn
U_Z]doo-z!}/OP
 
 /::daAE::  $)2>3G3GHX3Y)Z 	)%K{+DAq%**1a3;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4(	) ?:++N;)
 ;s
   +E"E')
r   r   r   __doc__r   r#   r;   TensorrK   __classcell__r+   s   @r   r   r   #   s?    j~ jell  RWR^R^ r   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Mistral3MultiModalProjectorr   c                 ^   t         |           t        |j                  j                        | _        t        |      | _        t        |j                  t              rdnt        |j                        }t        j                  |j                  j                  |z  |j                  j                  |j                        | _        t"        |j$                     | _        t        j                  |j                  j                  |j                  j                  |j                        | _        y )Nr   r    )r"   r#   r   r$   r%   normr   patch_merger
isinstancevision_feature_layerintlenr   r(   text_configmultimodal_projector_biaslinear_1r	   projector_hidden_actactlinear_2)r*   r   num_feature_layersr+   s      r   r#   z$Mistral3MultiModalProjector.__init__J   s    #F$8$8$D$DE	/7",V-H-H#"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r   r,   r-   c                     | j                  |      }| j                  ||      }| j                  |      }| j                  |      }| j	                  |      }|S )N)rS   rT   r[   r]   r^   )r*   r,   r-   hidden_statess       r   rK   z#Mistral3MultiModalProjector.forwardZ   sR    >2**>;Gn5/m4r   )	r   r   r   r   r#   r;   rM   rK   rN   rO   s   @r   rQ   rQ   I   s*    
~ 
 ell  r   rQ   c                       e Zd Zy)Mistral3CausalLMOutputWithPastNr   r   r   r   rc   rc   c   r   r   rc   c            #          e Zd Zdej                  deeee   f   dej                  fdZ		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
ej                     d	e
eej                        d
e
ej                     de
eeee   f      de
ej                     de
e   de
e   de
e   de
e   de
ej                     deeej                  f   de
ej                     deeef   f dZy) Mistral3ForConditionalGenerationpixel_valuesrV   r-   c                 |   |j                         D ci c]  \  }}|	|| }}} | j                  |f|dd|}t        |t              r|j                  |   }n3|D 	cg c]  }	|j                  |	    }
}	t        j                  |
d      }| j                  |j                  d      |      }|S c c}}w c c}	w )a=  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, List[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            image_sizes (`torch.Tensor`):
                Tensor containing the image sizes as returned by the processor.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        T)r-   output_hidden_statesr0   r3   r   )	itemsvision_towerrU   rW   ra   r;   r@   multi_modal_projectorsqueeze)r*   rf   rV   r-   kwargskvimage_outputsselected_image_feature	layer_idxhs_poolr,   s               r   get_image_featuresz3Mistral3ForConditionalGeneration.get_image_featuresh   s    , $*<<>C41aQ]!Q$CC))),uKfjuntu *C0%2%@%@AU%V"Ocd)}229=dGd%*YYwB%?"334J4R4RST4UWbc D es   
B3B3!B9N	input_idsattention_maskposition_idspast_key_valuesinputs_embedslabels	use_cacheoutput_attentionsrh   return_dictcache_positionlogits_to_keepr.   c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      ||t        d      | | j                         |      }|#| j                  |||      }|| j                   j                  k(  j                  d      }|j                  |      j                  |j                        }t               s{||   j                         |j                         k7  rW|| j                   j                  k(  j                         }|j                   d   |j                   d   z  }t        d| d	|       |j                  |j                  |j"                        }|j%                  ||      } | j&                  d|||||	|
||||d

|}|d   }d}|<||dd|j                   d   dz
   df   j                  |j                        }|dddddf   |j                  |j                        dk7     j)                         }|dddf   |j                  |j                        dk7     j)                         }n1|dddddf   j)                         }|dddf   j)                         }t+        j,                         } ||j/                  d|j1                  d            |j/                  d      j                  |j                              }|s|f|dd z   }||f|z   S |S t3        |||j4                  |j6                  |j8                  |      S d      S )a<  
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).


        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```Nz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)rf   rV   r-   r0   r   r   z6Image features and image tokens do not match: tokens: z, features )
rv   rw   rx   ry   r{   r|   rh   r}   r~   r   .)losslogitsrx   ra   
attentionsimage_hidden_statesr   )r   r|   rh   use_return_dictrV   
ValueErrorget_input_embeddingsrt   image_token_indexr:   	expand_astodevicer
   numelsumr5   dtypemasked_scatterlanguage_model
contiguousr   CrossEntropyLossr8   sizerc   rx   ra   r   )r*   ru   rf   rv   rw   rx   ry   rV   rz   r{   r|   rh   r}   r~   r   r-   	lm_kwargsr,   special_image_maskn_image_tokensn_image_featuresoutputsr   r   shift_attention_maskshift_logitsshift_labelsloss_fctoutputs                                r   rK   z(Mistral3ForConditionalGeneration.forward   s   r 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	 -t";<YZZ#(Av   7D557	BM#!44)%9' 5 N #,t{{/L/L"L!W!WXZ![!3!=!=m!L!O!OP]PdPd!e+--@R2S2Y2Y2[_m_s_s_u2u"+t{{/L/L"L!Q!Q!S#1#7#7#:^=Q=QRS=T#T  L^L\\ghxgyz  ,..}/C/C]EXEXYN)889K^\M%$%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC-#33!//))2>2J
 	
 QU
 	
r   )NNNNNNNNNNNNNr   N)r   r   r   r;   FloatTensorr   rW   r   rM   rt   r   
LongTensorboolr   rc   rK   r   r   r   re   re   g   s   "''" $CcN3" \\	"L 15481537=A59@D-1$(,0/3&*5934.2!L
E,,-L
 u001L
 !.	L

 u//0L
 "$u'8'8"9:L
   1 12L
 'uS$s)^'<=L
 ))*L
 D>L
 $D>L
 'tnL
 d^L
 !!1!12L
 c5<</0L
  ell+!L
$ 
u44	5%L
r   re   Mistral3PreTrainedModel)typingr   r   r   r   r;   r   activationsr	   utilsr
   r   llava.modeling_llavar   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler   rQ   rc   re   __all__r   r   r   <module>r      s     0 /   ! 6 ] 5 2 
		H	%	n 	#")) #L")) 4	%@ 	q
'D q
j &r   