import torch

from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
from ..utils import logging


logger = logging.get_logger(__name__)


_use_top_left_mask = flash_attn_supports_top_left_mask()


def get_target_dtype(query: torch.Tensor, module: torch.nn.Module) -> torch.dtype:
    """If the query is in float32, return a target dtype compatible with flash attention. Return None otherwise."""
    if query.dtype == torch.float32:
        # If autocast is active, flash attention will run under it, so use the autocast dtype
        if torch.is_autocast_enabled("cuda"):
            return torch.get_autocast_dtype("cuda")
        # Handle the case where the model is quantized: the weight dtypes are not representative,
        # so rely on the dtype recorded on the config
        elif hasattr(module.config, "_is_quantized"):
            return module.config.dtype
        # Otherwise, infer the compute dtype from the first Linear layer's weights
        else:
            return next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype
    return None

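
# A minimal sketch of how the helper above behaves (hedged: `model` and the module path are
# hypothetical illustrations, not part of this file's API). With autocast disabled and a
# non-quantized bf16 checkpoint, the dtype of the first `nn.Linear` weight is returned:
#
#     attn = model.model.layers[0].self_attn       # hypothetical attention submodule with a .config
#     q = torch.randn(1, 8, 128, 64)               # float32 query, e.g. after an fp32 LayerNorm
#     target = get_target_dtype(q, attn)           # -> torch.bfloat16; None if q were not float32
#     q = q.to(target) if target is not None else q
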
                    |
                     dd          rt                              d           |j        d         }t	          d |j        D                       rt          d          |                    dd          }|                    dd          }|                    dd          }t          ||           }|	|	n| j        }	t          ||||f||	||||t          || j        j        t          | d          r| j        nd d	
|
}|d fS )
Noutput_attentionsFzFlash Attention does not support `output_attentions=True`. Please set your attention to `eager` if you want any of these features.r   c              3   "   K   | ]
}|d k    V  dS )r   N )r   dims     r   r   z*flash_attention_forward.<locals>.<genexpr>/   s&      
+
+3!8
+
+
+
+
+
+r   zTensor query has shape  with a zero dimension.
FlashAttention does not support inputs with dim=0.
Please check your input shapes or use SDPA instead.   	layer_idx)
query_lengthr)   r%   softmax_scaler'   r(   use_top_left_masktarget_dtypeattn_implementationr0   )getloggerwarning_onceshapeany
ValueError	transposer    r)   r   _use_top_left_maskr   _attn_implementationr   r0   )r   r   r"   r#   r$   r%   r&   r'   r(   r)   kwargsseq_lenr4   attn_outputs                 r   flash_attention_forwardrB      sV    zz%u-- 
W	
 	
 	
 k!nG

+
+u{
+
+
+++ 
B
 
 	
 OOAq!!E
--1

COOAq!!E $E622L '2		8HI*	
 %,!"M>&-fk&B&BL&""   K$ r   )r!   NNNN)r   modeling_flash_attention_utilsr   r   utilsr   
get_logger__name__r7   r=   Tensorr   Moduler   r    floatintbooltuplerB   r-   r   r   <module>rM      sW    h h h h h h h h       
	H	%	%6688 
EL 
%(/ 
ek 
 
 
 
&  !% != =HO=<= 
= <	=
 L4'= = T\= $J= T\= d{= 5<= = = = = =r   
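
# Sketch of the calling convention (hedged: shapes and `attn_module` are illustrative assumptions;
# `attn_module` stands for an attention layer exposing `.config` and `.is_causal`). Model code
# looks this function up in the attention-interface registry and passes tensors shaped
# (batch, num_heads, seq_len, head_dim); the transpose to FA2's (batch, seq_len, num_heads,
# head_dim) layout happens inside:
#
#     q = torch.randn(2, 8, 128, 64, dtype=torch.float16, device="cuda")
#     k = torch.randn(2, 2, 128, 64, dtype=torch.float16, device="cuda")  # GQA: fewer KV heads is fine
#     v = torch.randn(2, 2, 128, 64, dtype=torch.float16, device="cuda")
#     out, _ = flash_attention_forward(attn_module, q, k, v, attention_mask=None,
#                                      scaling=64 ** -0.5, is_causal=True)
#     # `out` comes back as (batch, seq_len, num_heads, head_dim); the second element is always None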