
    Sܶi-                     L   d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlZd dlmc mZ d dlmZmZ ddlmZ ddlmZ dd	lmZ 	 d d
lmZ dZn# eeef$ r dZdZY nw xY we G d d                      Z G d dej                   Z  G d dej!                  Z! G d dej"                  Z"d"dZ#ed             Z$ G d dej%                  Z& G d dej%                  Z' G d dej%                  Z( G d dej%                  Z) G d  d!ej%                  Z*dS )#    N)contextmanager)	dataclass)DictIterableOptionalTuple)Tensornn   )decode)detect_language)
transcribe)scaled_dot_product_attentionTFc                   t    e Zd ZU eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   dS )ModelDimensionsn_melsn_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layerN)__name__
__module____qualname__int__annotations__     P/root/projects/openclaw-proxy/venv/lib/python3.11/site-packages/whisper/model.pyr   r      s         KKKLLLOOOr"   r   c                   (     e Zd Zdedef fdZ xZS )	LayerNormxreturnc                     t                                          |                                                              |j                  S N)superforwardfloattypedtype)selfr&   	__class__s     r#   r+   zLayerNorm.forward(   s1    wwqwwyy))..qw777r"   )r   r   r   r	   r+   __classcell__r0   s   @r#   r%   r%   '   sK        8 8F 8 8 8 8 8 8 8 8 8 8r"   r%   c                       e Zd ZdedefdZdS )Linearr&   r'   c                     t          j        || j                            |j                  | j        d n| j                            |j                            S r)   )Flinearweighttor.   bias)r/   r&   s     r#   r+   zLinear.forward-   sJ    xKNN17##I%DD49<<+@+@
 
 	
r"   N)r   r   r   r	   r+   r!   r"   r#   r4   r4   ,   s6        
 
F 
 
 
 
 
 
r"   r4   c                   <     e Zd Zdededee         def fdZ xZS )Conv1dr&   r8   r:   r'   c                     t                                          ||                    |j                  |d n|                    |j                            S r)   )r*   _conv_forwardr9   r.   )r/   r&   r8   r:   r0   s       r#   r>   zConv1d._conv_forward6   sL     ww$$vyy!!4<44TWWQW=M=M
 
 	
r"   )r   r   r   r	   r   r>   r1   r2   s   @r#   r<   r<   5   sb        

!'
/7/?
	
 
 
 
 
 
 
 
 
 
r"   r<   '  c                    |dz  dk    sJ t          j        |          |dz  dz
  z  }t          j        | t          j        |dz            z            }t          j        |           ddt           j        f         |t           j        ddf         z  }t          j        t          j        |          t          j        |          gd          S )z*Returns sinusoids for positional embedding   r   r   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_times         r#   	sinusoidsrS   >   s    a<1 f]33x1}q7HIY 77%,xST}:U:UUVVN,v&&qqq"*}5rzSTSTST}8UUK9ei,,ei.D.DE1MMMMr"   c               #   ~   K   t           j        } 	 dt           _        d V  | t           _        d S # | t           _        w xY w)NF)MultiHeadAttentionuse_sdpa)
prev_states    r#   disable_sdparX   G   sI      #,J1&+#&0###j#0000s   . <c                        e Zd ZdZdedef fdZ	 	 	 ddedee         dee         d	ee         fd
Z		 ddedededee         de
ej        eej                 f         f
dZ xZS )rU   Tn_staten_headc                    t                                                       || _        t          ||          | _        t          ||d          | _        t          ||          | _        t          ||          | _        d S )NF)r:   )r*   __init__r[   r4   querykeyvalueout)r/   rZ   r[   r0   s      r#   r]   zMultiHeadAttention.__init__T   sm    GW--
'7777GW--
'7++r"   Nr&   xamaskkv_cachec                 D   |                      |          }||	| j        |vr3|                     ||n|          }|                     ||n|          }n|| j                 }|| j                 }|                     ||||          \  }}	|                     |          |	fS r)   )r^   r_   r`   qkv_attentionra   )
r/   r&   rb   rc   rd   qkvwvqks
             r#   r+   zMultiHeadAttention.forward\   s     JJqMMrzTXX-E-E bjb11A


1133AA "A$A##Aq!T22Bxx||Rr"   rg   rh   ri   r'   c                    |j         \  }}}|| j        z  dz  } |j        g |j         d d         | j        dR                      dddd          } |j        g |j         d d         | j        dR                      dddd          } |j        g |j         d d         | j        dR                      dddd          }t          rVt
          j        rJt          ||||d uo|dk              }	|	                    dddd                              d          }
d }n||z  ||z  	                    dd	          z  }|||d |d |f         z   }|
                                }t          j        |d
                              |j                  }||z                      dddd                              d          }
|                                }|
|fS )Ng      пrA   r   r      )	is_causal)	start_dimrB   )shaper[   viewpermuteSDPA_AVAILABLErU   rV   r   flatten	transposer,   r6   softmaxr9   r.   detach)r/   rg   rh   ri   rc   n_batchn_ctxrZ   scaleara   rk   ws                r#   rf   z MultiHeadAttention.qkv_attentionr   s    #$'DK'E1AF1AGBQBK11b11199!Q1EEAF1AGBQBK11b11199!Q1EEAF1AGBQBK11b11199!Q1EE 	09 	,1a4t#3#A	  A ))Aq!Q''//!/<<CBBe)E	44R<<<B$vvvv~..B	""%%%((11Aq5//!Q1--555BBCBBwr"   NNNr)   )r   r   r   rV   r   r]   r	   r   dictr+   r   rF   rf   r1   r2   s   @r#   rU   rU   Q   s       H, ,S , , , , , ,  $!%#'     V  v	 
 4.       . IM "'-5=f5E	u|Xel33	4       r"   rU   c            
       r     e Zd Zddededef fdZ	 	 	 ddedee         d	ee         d
ee         fdZ	 xZ
S )ResidualAttentionBlockFrZ   r[   cross_attentionc                    t                                                       t          ||          | _        t	          |          | _        |rt          ||          nd | _        |rt	          |          nd | _        |dz  }t          j	        t          ||          t          j                    t          ||                    | _        t	          |          | _        d S )N   )r*   r]   rU   attnr%   attn_ln
cross_attncross_attn_lnr
   
Sequentialr4   GELUmlpmlp_ln)r/   rZ   r[   r   n_mlpr0   s        r#   r]   zResidualAttentionBlock.__init__   s    &w77	 )) 4CLw/// 	 4CLYw///!=7E""BGIIveW/E/E
 
  ((r"   Nr&   rb   rc   rd   c                 :   ||                      |                     |          ||          d         z   }| j        r4||                     |                     |          ||          d         z   }||                     |                     |                    z   }|S )Nrc   rd   r   )rd   )r   r   r   r   r   r   )r/   r&   rb   rc   rd   s        r#   r+   zResidualAttentionBlock.forward   s     		$,,q//x	HHKK? 	UDOOD$6$6q$9$92OQQRSTTAQ(((r"   )Fr   )r   r   r   r   boolr]   r	   r   r   r+   r1   r2   s   @r#   r   r      s        ) ) )S )4 ) ) ) ) ) )(  $!%#'  V v	
 4.       r"   r   c            
       @     e Zd Zdededededef
 fdZdefdZ xZS )	AudioEncoderr   r{   rZ   r[   n_layerc                    t                                                       t          |dd          | _        t          ddd          | _        |                     dt          |                     t          j        fdt          |          D                       | _
        t                    | _        d S )Nrn   r   )kernel_sizepaddingrA   )r   strider   positional_embeddingc                 0    g | ]}t                    S r!   r   .0_r[   rZ   s     r#   
<listcomp>z)AudioEncoder.__init__.<locals>.<listcomp>   s$    MMM#GV44MMMr"   )r*   r]   r<   conv1conv2register_bufferrS   r
   
ModuleListrangeblocksr%   ln_post)r/   r   r{   rZ   r[   r   r0   s      `` r#   r]   zAudioEncoder.__init__   s     	FGAFFF
GW!AqQQQ
3Yug5N5NOOO8:MMMMMeGnnMMM9
 9
 !))r"   r&   c                    t          j        |                     |                    }t          j        |                     |                    }|                    ddd          }|j        dd         | j        j        k    s
J d            || j        z                       |j                  }| j	        D ]} ||          }| 
                    |          }|S )zt
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        r   rA   r   Nzincorrect audio shape)r6   gelur   r   rt   rr   r   r9   r.   r   r   )r/   r&   blocks      r#   r+   zAudioEncoder.forward   s    
 F4::a==!!F4::a==!!IIaAwqrr{d7====?V===**..qw77[ 	 	EaAALLOOr"   )r   r   r   r   r]   r	   r+   r1   r2   s   @r#   r   r      s        **"%*03*=@*KN* * * * * *        r"   r   c            
       V     e Zd Zdededededef
 fdZdded	ed
ee         fdZ xZ	S )TextDecoderr   r{   rZ   r[   r   c                 "   t                                                       t          j        |          | _        t          j        t          j        |                    | _        t          j	        fdt          |          D                       | _        t                    | _        t          j        ||                              t          j                                       d          }|                     d|d           d S )Nc                 4    g | ]}t          d           S )T)r   r   r   s     r#   r   z(TextDecoder.__init__.<locals>.<listcomp>   s8        'wMMM  r"   r   rc   F
persistent)r*   r]   r
   	Embeddingtoken_embedding	ParameterrF   emptyr   r   r   r   r%   lnfill_rD   inftriu_r   )r/   r   r{   rZ   r[   r   rc   r0   s      ``  r#   r]   zTextDecoder.__init__   s     	!|GW==$&LUG1L1L$M$M!8:    w  9
 9
 G$${5%((..w77==a@@VTe<<<<<r"   Nr&   rb   rd   c                    |r9t          t          |                                                    j        d         nd}|                     |          | j        |||j        d         z            z   }|                    |j                  }| j        D ]} |||| j	        |          }| 
                    |          }|t          j        | j        j                            |j                  dd          z                                  }|S )z
        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
            the text tokens
        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
            the encoded audio features to be attended on
        r   r   rm   r   )nextitervaluesrr   r   r   r9   r.   r   rc   r   rF   rw   r8   r,   )r/   r&   rb   rd   offsetr   logitss          r#   r+   zTextDecoder.forward   s     <DJd8??,,--..4Q77  ##'!'"+1E(EFG 	
 DDNN[ 	@ 	@Ea$)h???AAGGAJJ 4 ; > >qw G GANNN
%'' 	 r"   r)   )
r   r   r   r   r]   r	   r   r   r+   r1   r2   s   @r#   r   r      s        ==#&=14=>A=LO= = = = = =&  V x~        r"   r   c                       e Zd Zdef fdZdefdZdej        fdZ	dej        dej        fd	Z
dej        dej        d
eeej        f         fdZed             Zed             Zed             Zddee         fdZeZeZeZ xZS )Whisperdimsc                 d   t                                                       || _        t          | j        j        | j        j        | j        j        | j        j        | j        j                  | _	        t          | j        j        | j        j        | j        j        | j        j        | j        j                  | _        t#          j        | j        j        | j        j        t"          j                  }d|| j        j        dz  d <   |                     d|                                d           d S )Nr.   TrA   alignment_headsFr   )r*   r]   r   r   r   r   r   r   r   encoderr   r   r   r   r   r   decoderrF   zerosr   r   	to_sparse)r/   r   	all_headsr0   s      r#   r]   zWhisper.__init__   s
   	#II!I#I"I#
 
 #II I"I!I"
 
 KI"DI$9
 
 
	 48	$)(A-//0.	0C0C0E0ERWXXXXXr"   dumpc                 t   t          j        t          j        t	          j        |                    t                                                    }t          j	        |          
                    | j        j        | j        j                  }|                     d|                                d           d S )Nr   r   Fr   )rD   
frombuffergzip
decompressbase64	b85decoder   copyrF   
from_numpyreshaper   r   r   r   r   )r/   r   arrayrc   s       r#   set_alignment_headszWhisper.set_alignment_heads  s    OF,T22334
 
 

$&& 	 &&..I"DI$9
 
 	.0@0@USSSSSr"   melc                 ,    |                      |          S r)   )r   )r/   r   s     r#   embed_audiozWhisper.embed_audio  s    ||C   r"   tokensaudio_featuresc                 .    |                      ||          S r)   )r   )r/   r   r   s      r#   r   zWhisper.logits"  s    ||FN333r"   r'   c                 T    |                      ||                     |                    S r)   )r   r   )r/   r   r   s      r#   r+   zWhisper.forward%  s$     ||FDLL$5$5666r"   c                 N    t          |                                           j        S r)   )r   
parametersdevicer/   s    r#   r   zWhisper.device*  s    DOO%%&&--r"   c                 "    | j         j        dk    S )Ni  )r   r   r   s    r#   is_multilingualzWhisper.is_multilingual.  s    y E))r"   c                 J    | j         j        dz
  t          | j                  z
  S )Ni5  )r   r   r   r   r   s    r#   num_languageszWhisper.num_languages2  s"    y 5(3t/C+D+DDDr"   Ncachec                      i ni g  fddt           j        ffd} j                            |           fS )a  
        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
        tensors calculated for the previous positions. This method returns a dictionary that stores
        all caches, and the necessary hooks for the key and value projection modules that save the
        intermediate tensors to be reused during later calculations.

        Returns
        -------
        cache : Dict[nn.Module, torch.Tensor]
            A dictionary object mapping the key/value projection modules to its cache
        hooks : List[RemovableHandle]
            List of PyTorch RemovableHandle objects to stop the hooks to be called
        Nc                     | vs|j         d         j        j        k    r|| <   n3t          j        |          |gd                                          | <   |          S )Nr   rB   )rr   r   r   rF   rJ   ry   )moduler   outputr   r/   s      r#   save_to_cachez5Whisper.install_kv_cache_hooks.<locals>.save_to_cacheG  se    U""fl1o	8L&L&L &f %	5=&*Aq I I I P P R Rf= r"   layerc                     t          | t                    r\                    | j                                                                     | j                                                 d S d S r)   )
isinstancerU   appendr_   register_forward_hookr`   )r   hooksr   s    r#   install_hooksz5Whisper.install_kv_cache_hooks.<locals>.install_hooksO  sm    %!344 OUY<<]KKLLLU[>>}MMNNNNNO Or"   )r
   Moduler   apply)r/   r   r   r   r   s   `` @@r#   install_kv_cache_hookszWhisper.install_kv_cache_hooks6  s     #.	5		B	! 	! 	! 	! 	! 	!	O 	O 	O 	O 	O 	O 	O 	O
 	=)))e|r"   r)   )r   r   r   r   r]   bytesr   rF   r	   r   r   r   strr+   propertyr   r   r   r   r   r   detect_language_functionr   transcribe_functionr   decode_functionr   r1   r2   s   @r#   r   r      sh       Y_ Y Y Y Y Y Y2T T T T T!u| ! ! ! !4U\ 45< 4 4 4 47<7).7	c5<	 7 7 7 7
 . . X. * * X* E E XE HTN    B /O$JFFFFFr"   r   )r?   )+r   r   
contextlibr   dataclassesr   typingr   r   r   r   numpyrD   rF   torch.nn.functionalr
   
functionalr6   r	   decodingr   r   r   r   r   r   r   ru   ImportErrorRuntimeErrorOSErrorr   r%   r4   r<   rS   rX   r   rU   r   r   r   r   r!   r"   r#   <module>r     s     % % % % % % ! ! ! ! ! ! 2 2 2 2 2 2 2 2 2 2 2 2                       / / / / / / A A A A A A 9 9 9 9 9 9@@@@@@NN\7+   #' NNN
 
 
 
 
 
 
 
 
8 8 8 8 8 8 8 8

 
 
 
 
RY 
 
 

 
 
 
 
RY 
 
 
N N N N 1 1 1: : : : : : : :z    RY   @    29   B* * * * *") * * *Z] ] ] ] ]bi ] ] ] ] ]s   A A%$A%