
    bi4                        d Z ddlmZ ddlZddlmZ ddlmZmZ ddl	m
Z
mZmZ  e            rddlmZ dd	lmZmZmZ  ej        e          Z G d
 d          Z	 d"dej        dej        dej        dej        eej        ej        f         z  fdZej        ez  Z	 	 	 	 	 d#dej        dedz  deeef         dz  dedz  ddf
dZdej        dedej        fdZ	 	 	 d$dej        j         dej        dej        dej        deej        df         de!dz  de!dz  d ej        dz  deej        ej        dz  f         fd!Z"dS )%a7  
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https//github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )UnionN)version   )is_torch_flex_attn_availablelogging)get_torch_versionis_torch_less_or_equalis_torchdynamo_compiling)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attentionc                   |     e Zd ZdZdZdZdZ fdZej	        
                    d          d             Zd Z xZS )WrappedFlexAttentionzh
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    NFc                 l    | j         &t                                          |           | _         | j         S N)	_instancesuper__new__)clsargskwargs	__class__s      c/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/flex_attention.pyr   zWrappedFlexAttention.__new__7   s*    = !GGOOC00CM}    )	recursivec                    | j         r|| j        k    r|| _        t          d          r!t          j        t
          d          | _        nkt          j        t                                j
        dk    r$|r"t          j        t
          dd          | _        nt          j        t
                    | _        d| _         dS dS )	z>
        Initialize or update the singleton instance.
        2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r   modeTN)_is_flex_compiledtrainingr	   torchcompiler   _compiled_flex_attentionr   parser   base_version)selfr"   s     r   __init__zWrappedFlexAttention.__init__=   s    
 % 	*T])B)B$DM%g.. N05nV[0\0\0\-- 02233@GKKPXK05"E8T1 1 1--
 16n0M0M-%)D""" *C)Br   c                     | j         S r   )r%   )r(   s    r   __call__zWrappedFlexAttention.__call__S   s    ,,r   )__name__
__module____qualname____doc__r   r!   r%   r   r#   compilerdisabler)   r+   __classcell__)r   s   @r   r   r   .   s          I#     ^e,,* * -,**- - - - - - -r   r   Fquerykeyvaluereturnc                 p    t                      s t          |                      nt          } || ||fi |S r   )r
   r   r   )r3   r4   r5   r"   r   flex_attention_compileds         r   compile_friendly_flex_attentionr9   W   s\     G_F`F`t<28<<>>>ft""  	  r   Tattention_mask_2dattention_chunk_sizeoffsets	is_causalr   c                 v     j         \  }}|s|}|s|}|t          z  dz   t          z  }t          j        j                             dd||z
  f            j        }	                                 |@                                                    d          	                    d          dz
  |z   fdfd}
 fd}|s|n|n|
|>|d         
                    |	          |d         
                    |	          fd	}n}t          ||d|||	t          d
                     S )aG  
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Create Block (causal) logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
       r   )r5   padNc                 l    ||k    }	| |f         	| |f         k    }| |f         dk    }||z  |z  }|S )z
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        r    )
	batch_idxhead_idxq_idxkv_idxcausal_maskdocument_maskpadding_mask
final_maskr:   document_idss
           r   causal_mask_modz4make_flex_block_causal_mask.<locals>.causal_mask_mod   sV     vo$Y%56,yRXGX:YY(E)9:Q> </-?
r   c                 V    | |f         | |f         k    } | |||          }||z  S )zU
        Combines the chunk mask with the causal mask for chunked attention.
        rC   )rD   rE   rF   rG   
chunk_maskcausal_doc_maskrM   
chunk_idxss         r   chunk_causal_mask_modz:make_flex_block_causal_mask.<locals>.chunk_causal_mask_mod   sC      	5 01Z	6@Q5RR
)/)XufMMO++r   c                 Z    | |f         | |f         k    }| |f         dk    }||z  }|S )zp
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        r   rC   )	rD   rE   rF   rG   rI   rJ   rK   r:   rL   s	          r   default_mask_modz5make_flex_block_causal_mask.<locals>.default_mask_mod   sH    
 %Y%56,yRXGX:YY(F):;a?!M1
r   c                 4    |z   }|z   } | |||          S r   rC   )	rD   rE   rF   rG   offset_q	offset_kv	kv_offsetmask_mod_maybe_combinedq_offsets	         r   mask_modz-make_flex_block_causal_mask.<locals>.mask_mod   s.    x'H*I**9h)TTTr   r   )r[   BHQ_LENKV_LENdevice_compile)shapeflex_default_block_sizer#   nn
functionalr@   r`   clonefill_cumsumtor   r	   )r:   r;   query_length
key_lengthr<   r=   
batch_sizetotal_seq_lenpad_lenr`   rR   rT   r[   rM   rQ   rL   rX   rY   rZ   s   `            @@@@@@r   make_flex_block_causal_maskro   m   s   D !2 7J #"
 %$55:>UUG+//0AQRT[^hThPi/jj%F$**,,L'"((**0033::2>>BH\]
     , , , , , ,	 	 	 	 	 	  m"25I5Q//Wl1:==((AJMM&))		U 	U 	U 	U 	U 	U 	U 	U
 +

+G444	 	 	 	r   hidden_statesn_repc                     | j         \  }}}}|dk    r| S | dddddddddf                             |||||          } |                     |||z  ||          S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r?   N)rb   expandreshape)rp   rq   batchnum_key_value_headsslenhead_dims         r   	repeat_kvry      s    
 2?1D.Ehzz!!!!QQQaaa"23::5BUW\^bdlmmM  (;e(CT8TTTr   moduleattention_maskscalingsoftcaps_auxc                 D   |                     dd          dk    rt          d          d }	d t          |t                    r|}	n|d d d d d d d |j        d         f         fd}
d}|j        d         }||dz
  z  dk    rTt          ||j        d         |j        d         z            }t          ||j        d         |j        d         z            }d	}|                     d
          }|j        j        dk    }|s|t          d          t          ||||
|	||||| j	        
  
        }|r|\  }}|
                    |j                  }||j        \  }}}}|                    dddd                              |||d          }|                    d          }t          j        t          j        ||gd          dd          }t          j        ||z
            }||z  }n|}d }|                    dd                                          }||fS )Ndropoutg        r   z`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or turn off the attention dropout in the respective config.c                 ~    t          j        | z            z  } | |         d         |         |         z   } | S )Nr   )r#   tanh)scorerD   rE   rF   rG   
score_maskr}   s        r   	score_modz)flex_attention_forward.<locals>.score_mod  sL    ej999E!Jy1!4U;FCCE r   Tr?   Fkernel_optionscpuzhAttention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA)r   
block_mask
enable_gqascaler   
return_lser"   rA   )dim)r   keepdimr   )get
ValueError
isinstancer   rb   ry   r`   typer9   r"   ri   dtypeviewrs   	unsqueezer#   	logsumexpcatexp	transpose
contiguous)rz   r3   r4   r5   r{   r|   r}   r~   r   r   r   r   num_local_query_headsr   r   flex_attention_outputattention_outputlserl   	num_heads	seq_len_q_sinkslse_expandedcombined_lserenorm_factorr   s         `                   @r   flex_attention_forwardr      s    zz)S!!A%%a
 
 	

 JJ.),, $#

#
111aaa39R= 89
      J!KN 	!6!:;AAU[^sy|;<<%Q5;q>!ABB
ZZ 011N"e+J 
%+v
 
 	
 <%      5#ffU[!!2B2H/J	9aJJq"a++22:y)UVWWE
 ==,,L ?59lE5JPR+S+S+SY[eijjjL "Il\&ABBM/-?0'11!Q77BBDDS  r   )F)NNNNT)NNN)#r/   typingr   r#   	packagingr   utilsr   r   utils.import_utilsr   r	   r
   !torch.nn.attention.flex_attentionr   rc   r   r   r   
get_loggerr,   loggerr   Tensortupler9   intOffsetboolro   ry   rd   Modulefloatr   rC   r   r   <module>r      s   8              9 9 9 9 9 9 9 9 d d d d d d d d d d  !! _gggggg^^^^^^^^^^ 
	H	%	%&- &- &- &- &- &- &- &-Z 	 <	 < \E%,455   $ 
	 (,,0!o o|o*o
 66>"T)o d{o o o o od	UU\ 	U# 	U%, 	U 	U 	U 	U$ ! !%]! ]!HO]!<]! 
]! <	]!
 %,34]! T\]! T\]! <$]! 5<,,-]! ]! ]! ]! ]! ]!r   