
    bi                         d dl Z d dl mZ ddlmZ de j        dede j        fdZd	ej        d
e j        de j        de j        de j        dz  defdZ	dS )    N)nn   )PagedAttentionCachehidden_statesn_repreturnc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
       N)shapeexpandreshape)r   r   batchnum_key_value_headsslenhead_dims         `/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/eager_paged.py	repeat_kvr      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTT    modulequerykeyvalueattention_maskscalingc                 R   |                     dd           }||                    ||| j        |d         |d                   \  }}|                    dd                              d          }|                    dd                              d          }t          | d          r*t          || j                  }t          || j                  }t          |t                    r&t          | dd          }|dk    s|d	nd
}	||	         }
n|}
t          j        ||                    dd                    |z  }|
||
z   }t          | d          r| j                            dddd                              |j        d         d|j        d         d          }t          j        ||gd          }||                    dd          j        z
  }t(          j                            |dt          j                                      |j                  }|dd df         }nDt(          j                            |dt          j                                      |j                  }t          j        ||          }|                    dd                                          }||fS )Ncache
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   r   r
   num_key_value_groupssliding_windowfull_attentionsliding_attentionr      sinks)dimT)r*   keepdim)r*   dtype.)popupdater!   	transpose	unsqueezehasattrr   r"   
isinstancedictgetattrtorchmatmulr'   r   r   r   catmaxvaluesr   
functionalsoftmaxfloat32tor,   
contiguous)r   r   r   r   r   r   kwargsr   r#   
layer_typecausal_maskattn_weightsr'   attn_outputs                 r   eager_paged_attention_forwardrD      s    )/

7D(A(AE\\&l+}- " 
 

U mmAq!!++A..1%%//22 v-.. >V899%!<== .$'' % )91==)71)<)<@V%%\o
$Z0$<s}}Q':':;;gEL#k1 vw 
h$$QAq1188QU[Y[_^`aay,!6B???#l&6&62t&6&L&L&SS},,\r,WWZZ[`[fgg#C"H-},,\r,WWZZ[`[fgg,|U33K''1--88::K$$r   )
r5   r   $generation.continuous_batching.cacher   Tensorintr   ModulefloatrD    r   r   <module>rK      s           F F F F F F	UU\ 	U# 	U%, 	U 	U 	U 	U8%I8%<8% 
8% <	8%
 L4'8% 8% 8% 8% 8% 8% 8%r   