
"""
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
"""

import logging
from functools import lru_cache
from typing import Any, NamedTuple

import torch


log = logging.getLogger(__name__)

__all__ = ["varlen_attn", "AuxRequest"]


@lru_cache(maxsize=None)  # a maxsize keyword is present in the recovered source; its value was not recoverable, so an unbounded cache is assumed
def _should_use_cudnn(device_index: int) -> bool:
    """Cache device capability check to avoid repeated CUDA calls."""
    # cuDNN varlen attention is disabled in the recovered source; always
    # report False so the Flash Attention path is taken.
    return False


class AuxRequest(NamedTuple):
    """
    Request which auxiliary outputs to compute from varlen_attn.

    Each field is a boolean indicating whether that auxiliary output should be computed.
    """

    lse: bool = False


@torch.library.custom_op("torch_attn::_varlen_attn", mutates_args=())
def _varlen_attn(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    cu_seq_q: torch.Tensor,
    cu_seq_k: torch.Tensor,
    max_q: int,
    max_k: int,
    is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Private custom op for variable-length attention.

    This is the internal implementation. Users should use the public varlen_attn function instead.
    """
    use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)

    if use_cudnn:
        log.info("Using cuDNN backend for varlen_attn")
        # Positional layout reconstructed from the recovered source:
        # attn_bias=None, compute_log_sumexp=True, dropout_p=0.0,
        # return_debug_mask=False.
        result = torch.ops.aten._cudnn_attention_forward(
            query,
            key,
            value,
            None,
            cu_seq_q,
            cu_seq_k,
            max_q,
            max_k,
            True,
            0.0,
            is_causal,
            False,
        )
        # output, logsumexp, and RNG state; the result indices follow the
        # recovered source.
        output, softmax_lse, rng_state = result[0], result[1], result[4]
    else:
        log.info("Using Flash Attention backend for varlen_attn")
        (
            output,
            softmax_lse,
            rng_state,
            _,
            _,
        ) = torch.ops.aten._flash_attention_forward(
            query,
            key,
            value,
            cu_seq_q,
            cu_seq_k,
            max_q,
            max_k,
            0.0,
            is_causal,
            return_debug_mask=False,
        )
        # Placeholder RNG state retained from the recovered source (unused).
        rng_state_ = torch.zeros(2, dtype=torch.uint64, device=query.device)

    return output, softmax_lse, rng_state


@_varlen_attn.register_fake
def _varlen_attn_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    cu_seq_q: torch.Tensor,
    cu_seq_k: torch.Tensor,
    max_q: int,
    max_k: int,
    is_causal: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Fake implementation for meta tensor computation and tracing.

    Based on the 3D varlen path from meta__flash_attention_forward:
    - query shape: (total, num_heads, head_dim)
    - logsumexp shape: (num_heads, total_q)
    """
    output = torch.empty_like(query)

    total_q = query.size(0)
    num_heads = query.size(1)
    logsumexp = torch.empty(
        (num_heads, total_q), dtype=torch.float, device=query.device
    )
    rng_state = torch.zeros(2, dtype=torch.uint64, device=query.device)

    return output, logsumexp, rng_state


def varlen_attn(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    cu_seq_q: torch.Tensor,
    cu_seq_k: torch.Tensor,
    max_q: int,
    max_k: int,
    is_causal: bool = False,
    return_aux: AuxRequest | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    """
    Compute variable-length attention using Flash Attention.

    This function is similar to scaled_dot_product_attention but optimized for
    variable-length sequences using cumulative sequence position tensors.

    Args:
    - query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
    - key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
    - value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
    - cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
    - cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
    - max_q (int): Maximum query sequence length in the batch.
    - max_k (int): Maximum key/value sequence length in the batch.
    - is_causal (bool, optional): If set to True, applies causal masking (default: False).
    - return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.

    Shape legend:
    - :math:`N`: Batch size
    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
    - :math:`H`: Number of attention heads
    - :math:`D`: Head dimension
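
    For example, two packed sequences of lengths 3 and 5 give
    ``cu_seq_q = [0, 3, 8]`` and :math:`T_q = 8`.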

    Returns:
    - Tensor: Output tensor from attention computation
    - If ``return_aux`` is not None and ``return_aux.lse`` is True, returns a tuple of Tensors:
    (output, lse), where lse is the logsumexp

    Example::

        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
        >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
        >>> head_dim = embed_dim // num_heads
        >>> seq_lengths = []
        >>> for _ in range(batch_size):
        ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
        ...     seq_lengths.append(min(length, max_seq_len))
        >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
        >>> total_tokens = seq_lengths.sum().item()
        >>>
        >>> # Create packed query, key, value tensors
        >>> query = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> key = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>> value = torch.randn(
        ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
        ... )
        >>>
        >>> # Build cumulative sequence tensor
        >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
        >>> cu_seq[1:] = seq_lengths.cumsum(0)
        >>> max_len = seq_lengths.max().item()
        >>>
        >>> # Call varlen_attn
        >>> output = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
        ... )
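        >>>
        >>> # Also request the logsumexp auxiliary output (illustrative)
        >>> output, lse = varlen_attn(
        ...     query, key, value, cu_seq, cu_seq, max_len, max_len,
        ...     is_causal=False, return_aux=AuxRequest(lse=True)
        ... )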
    """
    out, lse, _ = torch.ops.torch_attn._varlen_attn(
        query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal
    )
    if return_aux is not None and return_aux.lse:
        return out, lse
    return out


def _setup_context(ctx: Any, inputs: tuple[Any, ...], output: Any) -> None:
    query, key, value, cu_seq_q, cu_seq_k, max_q, max_k, is_causal = inputs
    out, lse, rng_state = output
    ctx.save_for_backward(query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state)
    ctx.max_q = max_q
    ctx.max_k = max_k
    ctx.is_causal = is_causal


@torch.library.custom_op("torch_attn::_varlen_attn_backward", mutates_args=())
def _varlen_attn_backward(
    grad_out: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    lse: torch.Tensor,
    cu_seq_q: torch.Tensor,
    cu_seq_k: torch.Tensor,
    max_q: int,
    max_k: int,
    is_causal: bool,
    rng_state: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    unused = torch.empty(0, device=query.device)

    use_cudnn = query.is_cuda and _should_use_cudnn(query.device.index)
    if use_cudnn:
        log.info("Using cuDNN backend for varlen_attn")
        # Argument plumbing reconstructed from the recovered source; this
        # branch is unreachable while _should_use_cudnn returns False.
        dq, dk, dv = torch.ops.aten._cudnn_attention_backward(
            grad_out,
            query,
            key,
            value,
            out,
            lse,
            cu_seq_q,
            cu_seq_k,
            max_q,
            max_k,
            0.0,
            is_causal,
            rng_state,
            unused,
        )
    else:
        log.info("Using Flash Attention backend for varlen_attn")
        dq, dk, dv = torch.ops.aten._flash_attention_backward(
            grad_out,
            query,
            key,
            value,
            out,
            lse,
            cu_seq_q,
            cu_seq_k,
            max_q,
            max_k,
            0.0,  # dropout_p
            is_causal,
            rng_state,  # philox seed/offset placeholders; ignored with zero dropout
            unused,
        )
    return dq, dk, dv


@_varlen_attn_backward.register_fake
def _varlen_attn_backward_fake(
    grad_out: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    out: torch.Tensor,
    lse: torch.Tensor,
    cu_seq_q: torch.Tensor,
    cu_seq_k: torch.Tensor,
    max_q: int,
    max_k: int,
    is_causal: bool,
    rng_state: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Fake implementation for meta tensor computation and tracing.
    """
    grad_query = torch.empty_like(query)
    grad_key = torch.empty_like(key)
    grad_value = torch.empty_like(value)
    return grad_query, grad_key, grad_value


def _backward(
    ctx: Any, grad_out: torch.Tensor, grad_lse: torch.Tensor, grad_rng: torch.Tensor
) -> tuple[torch.Tensor | None, ...]:
    query, key, value, cu_seq_q, cu_seq_k, out, lse, rng_state = ctx.saved_tensors

    max_q = ctx.max_q
    max_k = ctx.max_k
    is_causal = ctx.is_causal

    dq, dk, dv = torch.ops.torch_attn._varlen_attn_backward(
        grad_out,
        query,
        key,
        value,
        out,
        lse,
        cu_seq_q,
        cu_seq_k,
        max_q,
        max_k,
        is_causal,
        rng_state,
    )
    # Gradients for (query, key, value); None for the non-differentiable
    # cu_seq_q, cu_seq_k, max_q, max_k, and is_causal inputs.
    return dq, dk, dv, None, None, None, None, None


_varlen_attn.register_autograd(_backward, setup_context=_setup_context)
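

# Illustrative end-to-end sketch (assumptions: a CUDA device with fp16 support
# and the Flash Attention kernels available). Run this file directly to
# exercise the forward custom op and its registered autograd; nothing below
# executes on import.
if __name__ == "__main__":
    if torch.cuda.is_available():
        num_heads, head_dim = 4, 64
        seq_lengths = torch.tensor([128, 256], device="cuda")
        total = int(seq_lengths.sum())

        # Packed (total_tokens, num_heads, head_dim) layout, as in the docstring.
        q = torch.randn(
            total,
            num_heads,
            head_dim,
            dtype=torch.float16,
            device="cuda",
            requires_grad=True,
        )
        k = torch.randn_like(q, requires_grad=True)
        v = torch.randn_like(q, requires_grad=True)

        # Cumulative sequence boundaries: [0, 128, 384].
        cu_seq = torch.zeros(len(seq_lengths) + 1, device="cuda", dtype=torch.int32)
        cu_seq[1:] = seq_lengths.cumsum(0)
        max_len = int(seq_lengths.max())

        out, lse = varlen_attn(
            q,
            k,
            v,
            cu_seq,
            cu_seq,
            max_len,
            max_len,
            is_causal=True,
            return_aux=AuxRequest(lse=True),
        )
        out.sum().backward()  # runs _varlen_attn_backward via register_autograd
        print(out.shape, lse.shape, q.grad.shape)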