
    bi$                         d Z ddlZddlmZ ddlmZ  G d dej                  Z G d dej                  Z G d	 d
ej                  Z	dS )a  

Generic interface to various configurations of the Perceiver Resampler, that simply takes in a series of (potentially
time-indexed) contextual embeddings, and "resamples" (compresses) them down to a pre-specified number of latents! Note
that the Perceiver in general resamples based solely off the *long-range* context; there's a nice opportunity here to
prime the Perceiver Resampler with say a single layer's worth of language embeddings (the target domain), and use that
to softly "retrieve & compress" what we need --> this would be a novel contribution we should explore.

References:
    - DeepMind's Flamingo: https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model
    - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch

    N   )IdeficsConfigc                   `     e Zd Zdededededededdf fd	Zd
ej        dej        fdZ xZ	S )IdeficsPerceiverResamplerconfig	embed_dimdepthn_headshead_dim	n_latentsreturnNc                 $    t                                                       ||||f\   _         _         _         _        j        j         _        t          j
        t          j         j         j                  d           _        t          j        d          s
 j        dz  nj        j        dz   _        t          j         fdt%          |          D                        _        t          j         j                   _        dS )ao  
        Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or
        MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then
        returns a Tensor of shape [bsz, n_latents, embed_dim]. :param embed_dim: Dimensionality of embeddings being fed
        to the Perceiver Resampler (also dimensionality of latent embeddings *returned* by the Perceiver Resampler.
        Could be e.g., VIT embed_dim, ResNet pool dim, and so on.

        Args:
            config (`IdeficsConfig`): config object
            embed_dim (`int`): The size of each embedding vector
            depth (`int`): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3).
            n_heads (`int`): Number of heads in each Transformer block (for multi-headed self-attention).
            head_dim (`int`): Dimensionality of each head projection in the Transformer block.
            n_latents (`int`):
                Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).

        T)requires_gradr      c           
          g | ]O}t          j        t          j        j        j        j                  t          j                  g          PS  )	nn
ModuleListIdeficsPerceiverAttentionr   r
   r   qk_layer_norms
IdeficsMLPintermediate_dim).0_r   selfs     `/root/projects/butler/venv/lib/python3.11/site-packages/transformers/models/idefics/perceiver.py
<listcomp>z6IdeficsPerceiverResampler.__init__.<locals>.<listcomp>Q   si         1$.$,PTP]_c_rss"4#8&AA       N)super__init__r   r
   r   r   perceiver_configqk_layer_norms_perceiverr   r   	Parametertorchrandnlatentshasattrvision_configr   r   rangeblocks	LayerNorm
layer_norm)r   r   r   r	   r
   r   r   	__class__s   ``     r   r    z"IdeficsPerceiverResampler.__init__/   s   ( 	FOQXZbdmFmCdmT^$5N |EK$O$O_cddd 6/==4DNQ%/!3 	 m     u  

 

 ,t~66r   contextc                     | j                             |j        d         dd          }| j        D ]"\  }} |||          |z   } ||          |z   }#|                     |          S )zWResample arbitrary length context & *compress* down to self.n_latents latent embeddingsr   r   )r&   repeatshaper*   r,   )r   r.   r&   attnffs        r   forwardz!IdeficsPerceiverResampler.forward]   sw     ,%%gmA&61==  	, 	,HD"d7G,,w6GbkkG+GGw'''r   )
__name__
__module____qualname__r   intr    r$   Tensorr4   __classcell__r-   s   @r   r   r   .   s        ,7#,703,7<?,7JM,7Y\,7il,7	,7 ,7 ,7 ,7 ,7 ,7\
(u| 
( 
( 
( 
( 
( 
( 
( 
( 
(r   r   c            
       f     e Zd Zdededededdf
 fdZdej        d	ej        dej        fd
Z xZ	S )r   r   r
   r   r   r   Nc                    t                                                       |||c| _        | _        | _        || _        t          j        | j                  | _        t          j        | j                  | _	        | j        r<t          j        | j                  | _
        t          j        | j                  | _        | j        dz  | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        | j        z  d          | _        t          j        | j        | j        z  |d          | _        dS )ziPerceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`g      FbiasN)r   r    r   r
   r   r   r   r+   context_layer_normlatents_layer_normq_layer_normk_layer_normqk_scaleLinearq_projk_projv_projoutput_proj)r   r   r
   r   r   r-   s        r   r    z"IdeficsPerceiverAttention.__init__k   s7   6?(3dm,"$,t~">">"$,t~">"> 	< "T] ; ;D "T] ; ;Dt+ it}0LSXYYYit}0LSXYYYit}0LSXYYY9T\DM%A9SXYYYr   r.   r&   c                 H                          |          }                     |          }|j        dd         \  }}                     |          }                     t          j        ||gd                    }                     t          j        ||gd                    } fd|||fD             \  }}} j        r* 	                    |          } 
                    |          }t          j        d| j        z  |          }||                    dd	                                          z
  }	|	                    d          }
t          j        d
|
|          }                     |                    dd                              d                    S )aF  
        Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!

        Args:
            context (`torch.Tensor`):
                Tensor of shape `[bsz, seq, embed_dim]` representing long-form context to resample.
            latents (`torch.Tensor`):
                Tensor of shape `[bsz, n_latents, embed_dim]` representing fixed length latents to compress to.

        Returns:
            `torch.Tensor`: Tensor of shape `[bsz, n_latents, embed_dim]` representing attention over latents w/ cross
            from context.
        N   )dimc                     g | ]C}|                     |j        d          j        j                                      d d          DS )r      )reshaper1   r
   r   	transpose)r   x
batch_sizer   s     r   r   z5IdeficsPerceiverAttention.forward.<locals>.<listcomp>   sH    uuufg199ZT\4=QQ[[\]_`aauuur   z... i d, ... j d -> ... i jT)rM   keepdimz... i j, ... j d -> ... i dr   rO   )r@   rA   r1   rF   rG   r$   catrH   r   rB   rC   einsumrD   amaxdetachsoftmaxrI   rQ   flatten)r   r.   r&   
seq_lengthr   qkvscoresstabilized_scoresr2   	resampledrS   s   `           @r   r4   z!IdeficsPerceiverAttention.forward   s    ))'22))'22,3M"1",=)
J	 KK  KK	7G"4"===>>KK	7G"4"===>>
 vuuuulmoprsktuuu1a 	%!!!$$A!!!$$A;Q=NPQRR"fkkb$k&G&G&N&N&P&PQ ((R(00 L!>aHH		 3 3Aq 9 9 A A" E EFFFr   )
r5   r6   r7   r8   boolr    r$   r9   r4   r:   r;   s   @r   r   r   j   s        Z# Z Zs ZTX Z]a Z Z Z Z Z Z*(Gu| (Gel (Gu| (G (G (G (G (G (G (G (Gr   r   c                   Z     e Zd Zdef fdZdeej                 dz  dej        fdZ xZ	S )r   r   c                 Z   t                                                       |j        j        | _        t	          j        | j                  | _        t	          j        | j        |d          | _        t	          j	                    | _
        t	          j        || j        d          | _        dS )z:Simple MLP block with intermediate_size and embedding sizeFr>   N)r   r    r(   r   r   r+   lnrE   fcReLUactc_proj)r   intermediate_sizer   r-   s      r   r    zIdeficsMLP.__init__   s    -7,t~..)DN,=EJJJ799i 14>NNNr   hidden_statesNr   c                     |                      |          }|                     |          }|                     |          }|                     |          }|S )N)rf   rg   ri   rj   )r   rl   s     r   r4   zIdeficsMLP.forward   sL    ....//M22r   )
r5   r6   r7   r   r    tupler$   FloatTensorr4   r:   r;   s   @r   r   r      s}        O- O O O O O OU5+<%=%D IZ        r   r   )
__doc__r$   torch.nnr   configuration_ideficsr   Moduler   r   r   r   r   r   <module>rt      s   4         0 0 0 0 0 09( 9( 9( 9( 9(	 9( 9( 9(x>G >G >G >G >G	 >G >G >GB         r   