
    %	&h|9                         d Z ddlmZ ddlmZ  ej
                  e      Z G d de      Z G d de      Z	 G d d	e      Z
 G d
 de      Zg dZy)zSAM model configuration   )PretrainedConfig)loggingc                   6     e Zd ZdZdZ	 	 	 	 	 	 	 d fd	Z xZS )SamPromptEncoderConfiga  
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration defaults will yield
    a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
    prompt_encoder_configc                     t        	|   di | || _        || _        || _        ||z  | _        || _        || _        || _        || _	        y N )
super__init__hidden_size
image_size
patch_sizeimage_embedding_sizemask_input_channelsnum_point_embeddings
hidden_actlayer_norm_eps)
selfr   r   r   r   r   r   r   kwargs	__class__s
            /var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/sam/configuration_sam.pyr   zSamPromptEncoderConfig.__init__3   sW     	"6"&$$$.*$<!#6 $8!$,    )         r      geluư>__name__
__module____qualname____doc__base_config_keyr   __classcell__r   s   @r   r   r      s0    0 .O - -r   r   c                   <     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )SamMaskDecoderConfiga  
    This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
    mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
    will yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function used inside the `SamMaskDecoder` module.
        mlp_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsampling rate of the attention layer.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The number of layers in the IoU head module.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The dimensionality of the hidden states in the IoU head module.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.

    mask_decoder_configc                     t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        y r	   )r   r   r   r   mlp_dimnum_hidden_layersnum_attention_headsattention_downsample_ratenum_multimask_outputsiou_head_depthiou_head_hidden_dimr   )r   r   r   r,   r-   r.   r/   r0   r1   r2   r   r   r   s               r   r   zSamMaskDecoderConfig.__init__m   sc     	"6"&$!2#6 )B&%:",#6 ,r   )
r   relui         r4   r   r   r   r   r    r'   s   @r   r)   r)   I   s:    B ,O "#- -r   r)   c                   V     e Zd ZdZdZdZddddddd	d
ddddddddg dddf fd	Z xZS )SamVisionConfiga  
    This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of the SAM ViT-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        output_channels (`int`, *optional*, defaults to 256):
            Dimensionality of the output channels in the Patch Encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        image_size (`int`, *optional*, defaults to 1024):
            Expected resolution. Target size of the resized input image.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches to be extracted from the input image.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string)
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to query, key, value projections.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of mlp hidden dim to embedding dim.
        use_abs_pos (`bool`, *optional*, defaults to `True`):
            Whether to use absolute position embedding.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative position embedding.
        window_size (`int`, *optional*, defaults to 14):
            Window size for relative position.
        global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            The indexes of the global attention layers.
        num_pos_feats (`int`, *optional*, defaults to 128):
            The dimensionality of the position embedding.
        mlp_dim (`int`, *optional*):
            The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
            hidden_size`.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamVisionModel,
    ... )

    >>> # Initializing a SamVisionConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamVisionConfig()

    >>> # Initializing a SamVisionModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```vision_configsam_vision_modeli   r      r   r   r   r   r   g        g|=Tg      @   )r4      r5         Nc                 Z   t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        |t)        ||z        | _        y || _        y r	   )r   r   r   output_channelsr-   r.   num_channelsr   r   r   r   attention_dropoutinitializer_rangeqkv_bias	mlp_ratiouse_abs_posuse_rel_poswindow_sizeglobal_attn_indexesnum_pos_featsintr,   )r   r   r@   r-   r.   rA   r   r   r   r   rB   rC   rD   rE   rF   rG   rH   rI   rJ   r,   r   r   s                        r   r   zSamVisionConfig.__init__   s    . 	"6"&.!2#6 ($$$,!2!2 "&&&#6 *7>s;23Gr   )r!   r"   r#   r$   r%   
model_typer   r&   r'   s   @r   r7   r7      s]    BH &O#J ))+T +Tr   r7   c                   <     e Zd ZdZdZeeedZ	 	 	 	 d fd	Z	 xZ
S )	SamConfiga  
    [`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
    SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamPromptEncoderConfig,
    ...     SamMaskDecoderConfig,
    ...     SamModel,
    ... )

    >>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamConfig()

    >>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

    >>> # Initializing SAM vision, SAM Q-Former and language model configurations
    >>> vision_config = SamVisionConfig()
    >>> prompt_encoder_config = SamPromptEncoderConfig()
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```sam)r   r*   r8   c                 v   t        |   di | ||ni }||ni }||ni }t        |t              r|j	                         }t        |t
              r|j	                         }t        |t              r|j	                         }t        di || _        t        di || _        t        di || _	        || _
        y r	   )r   r   
isinstancer7   to_dictr   r)   r8   r   r*   rC   )r   r8   r   r*   rC   r   r   s         r   r   zSamConfig.__init__7  s     	"6")6)B9N9Z 5`b5H5T1Z\m_5)113M+-CD$9$A$A$C!)+?@"5"="="?,=}=%;%T>S%T"#7#N:M#N !2r   )NNNg{Gz?)r!   r"   r#   r$   rL   r   r)   r7   sub_configsr   r&   r'   s   @r   rN   rN      s8    /b J!73(K " 3 3r   rN   )rN   r)   r   r7   N)r$   configuration_utilsr   utilsr   
get_loggerr!   loggerr   r)   r7   rN   __all__r
   r   r   <module>rY      sj     3  
		H	%.-- .-b<-+ <-~sT& sTlP3  P3f ]r   