
"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from collections.abc import Callable
from dataclasses import dataclass

import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, can_return_tuple, logging
from ...utils.generic import is_flash_attention_requested
from .configuration_idefics import IdeficsVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    r"""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
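
            For example, an encoder with `num_hidden_layers = 32` yields a tuple of 33 entries: the embedding
            output followed by one entry per layer.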
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


class IdeficsVisionEmbeddings(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.
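
        For example, a model pretrained at 224x224 resolution with 14-pixel patches stores a 16x16 grid of patch
        position vectors; running it on 448x448 images requires a 32x32 grid, which this method obtains by
        bicubically resampling the stored grid.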

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """
        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # add a small number to avoid a floating point error in the interpolation
        # (see https://github.com/facebookresearch/dino/issues/8)
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in "
                "nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a "
                "slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
                )

        target_dtype = self.patch_embedding.weight.dtype
        # [batch, embed_dim, grid, grid] -> [batch, num_patches, embed_dim]
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # vanilla scaled dot-product attention: softmax(Q K^T * scaling + mask) V
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        causal_attention_mask: torch.Tensor | None = None,
        output_attentions: bool | None = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # merge the two masks unless a flash-attention kernel is requested, in which case
        # causality is conveyed through `is_causal` rather than an explicit mask
        if not is_flash_attention_requested(self.config):
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class IdeficsVisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: bool | None = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
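
                For instance, a row holding five real tokens followed by two padding tokens would use the mask
                `[1, 1, 1, 1, 1, 0, 0]`.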
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = IdeficsVisionEmbeddings(config)
        # "pre_layrnorm" (sic) matches the attribute name used by checkpoint weights
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        interpolate_pos_encoding: bool | None = False,
        return_dict: bool | None = None,
    ) -> tuple | BaseModelOutputWithPooling:
        r"""
        Returns:
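
        Example (a minimal sketch; the tiny config values here are illustrative, not checkpoint defaults):

        ```python
        >>> import torch
        >>> from transformers.models.idefics.configuration_idefics import IdeficsVisionConfig
        >>> from transformers.models.idefics.vision import IdeficsVisionTransformer

        >>> config = IdeficsVisionConfig(
        ...     embed_dim=64, intermediate_size=128, num_hidden_layers=2, num_attention_heads=2, image_size=28, patch_size=14
        ... )
        >>> model = IdeficsVisionTransformer(config).eval()
        >>> pixel_values = torch.randn(1, 3, config.image_size, config.image_size)
        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)
        >>> # one CLS token + (28 // 14) ** 2 = 4 patch tokens, each of width embed_dim
        >>> list(outputs.last_hidden_state.shape)
        [1, 5, 64]
        ```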

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )