
    bi                         d dl Z ddlmZ ddlmZ 	 	 	 	 	 	 dde j        j        de j        de j        de j        d	e j        dz  d
ede j        fdZdS )    N   )PagedAttentionCache)!lazy_import_paged_flash_attentionmoduleqkvattention_maskcachereturnc
           	      .   t          | j        j                  }t          | dd          sdn| j        dz
  df}|dk    rdnd}|.|                    ||| j        |
d	         |
d
                   \  }}t          |t                    r||         }|	|         }	d|
v rd|
	                    d          ini } ||
                    dd                              d                                          |                                |                                |                    t          j                  |                    t          j                                                  ||	f| j        d|d|}t          |t$                    r|d         }|dfS )a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    sliding_windowF)r      r   full_attentionsliding_attentionN
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   s_auxr   T)softmax_scalecausalwindow_size)r   config_attn_implementationgetattrr   updater   
isinstancedictget	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r	   r
   r   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kkwargsflash_attn_varlen_funcr   
layer_typecustom_kwargsattn_outputs                   `/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/flash_paged.pypaged_attention_forwardr6      s   F ?v}?abb%,V5Eu%M%MqXXTZTilmTmopSqN%3x%?%?!!EXJ ||&l+}-  
 
1 -&& 0%j1#J/6=6G6GWfjj1122RM((	Aq!!!$$//11		%%%%++-- n"   K +u%% %!!n    )NNNNNN)	r'   generation.continuous_batchingr   modeling_flash_attention_utilsr   nnModuleTensorr6    r7   r5   <module>r>      s     @ @ @ @ @ @ N N N N N N +/!%H HHOH|H |H |	H
 L4'H H \H H H H H Hr7   