
    Sܶi}                     J   d dl mZmZmZ d dlmZmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmc mZ d dlmZ d dlmZ ddlmZ ddlmZmZ dd	lmZ erdd
lmZ  ej                     	 d1dddededeeee!         f         fd            Z" ed           G d d                      Z# ed           G d d                      Z$ G d d          Z% G d de%          Z& G d d          Z' G d de'          Z( G d d           Z) G d! d"e)          Z* G d# d$e)          Z+ G d% d&          Z, G d' d(e,          Z- G d) d*e,          Z. G d+ d,e,          Z/ G d- d.          Z0 ej                      e#            fddded/e#dee$ee$         f         fd0            Z1dS )2    )	dataclassfieldreplace)TYPE_CHECKINGDictIterableListOptionalSequenceTupleUnionN)Tensor)Categorical   )CHUNK_LENGTH)	Tokenizerget_tokenizer)compression_ratio)Whispermodelr   mel	tokenizerreturnc                   
 t          | j        | j                  j        j        j        vrt          d          |j        dk    }|r|                    d          }|j	        dd         | j
        j        | j
        j        fk    r|                     |          }|j	        d         }t          j        j        gg|z                                |j                  }|                     ||          dddf         }t          j        |j	        d         t          j                  }d	|t-          j                  <   t0          j         |dd|f<   |                    d
          }|                    d
                                          

fdt;          |          D             }	|r|d         }|	d         }	||	fS )ao  
    Detect the spoken language in the audio, and return them as list of strings, along with the ids
    of the most probable language tokens and the probability distribution over all language tokens.
    This is performed outside the main decode loop in order to not interfere with kv-caching.

    Returns
    -------
    language_tokens : Tensor, shape = (n_audio,)
        ids of the most probable language tokens, which appears after the startoftranscript token.
    language_probs : List[Dict[str, float]], length = n_audio
        list of dictionaries containing the probability distribution over all languages.
    N)num_languageszCThis model doesn't have language tokens so it can't perform lang id   r   )dtypeFdimc                 `    g | ])fd t          j        j                  D             *S )c                 P    i | ]"\  }}||f                                          #S  )item).0jcilanguage_token_probss      S/root/projects/openclaw-proxy/venv/lib/python3.11/site-packages/whisper/decoding.py
<dictcomp>z.detect_language.<locals>.<listcomp>.<dictcomp>B   sE     	
 	
 	
1 #AqD)..00	
 	
 	
    )zipall_language_tokensall_language_codes)r&   r)   r*   r   s    @r+   
<listcomp>z#detect_language.<locals>.<listcomp>A   sb       
 		
 	
 	
 	
 	
I99;WXX	
 	
 	
  r-   )r   is_multilingualr   languagelanguage_tokensot_sequence
ValueErrorndim	unsqueezeshapedimsn_audio_ctxn_audio_stateencodertorchtensorsottodevicelogitsonesboollistr/   npinfargmaxsoftmaxcpurange)r   r   r   singlen_audioxrC   masklanguage_tokenslanguage_probsr*   s     `       @r+   detect_languagerS      s     !!1D
 
 
	 	"#9+AAAQ
 
 	
 X]F mmA y~%*0%*2JKKKmmC   ilGy}o&01144SZ@@A\\!S!!!!!Q$'F :fl2&ej999D05Di+	,	,-vgF111d7Ommm++O!>>b>115577    
 w  N  +)!,'*N**r-   T)frozenc                      e Zd ZU dZeed<   dZee         ed<   dZe	ed<   dZ
ee         ed<   dZee         ed<   dZee         ed	<   dZee	         ed
<   dZee	         ed<   dZeeeee         f                  ed<   dZeeeee         f                  ed<   dZeeeee         f                  ed<   dZeed<   dZeed<   dZee	         ed<   dZeed<   dS )DecodingOptions
transcribetaskNr3   g        temperature
sample_lenbest_of	beam_sizepatiencelength_penaltypromptprefixz-1suppress_tokensTsuppress_blankFwithout_timestamps      ?max_initial_timestampfp16)__name__
__module____qualname__rX   str__annotations__r3   r
   rY   floatrZ   intr[   r\   r]   r^   r_   r   r	   r`   ra   r   rb   rE   rc   re   rf   r$   r-   r+   rV   rV   P   sh         D# #Hhsm""" K $J$$$!GXc]!!!#Ix}### $Hhuo$$$ '+NHUO*** /3FHU3S	>*+222.2FHU3S	>*+222 <@OXeC#$678???ND  %$$$-08E?000 D$r-   rV   c                       e Zd ZU eed<   eed<   dZeeee	f                  ed<    e
e          Zee         ed<   dZeed<   ej        Ze	ed	<   ej        Ze	ed
<   ej        Ze	ed<   ej        Ze	ed<   dS )DecodingResultaudio_featuresr3   NrR   )default_factorytokens textavg_logprobno_speech_probrY   r   )rg   rh   ri   r   rk   rj   rR   r
   r   rl   r   rF   rr   r	   rm   rt   rG   nanru   rv   rY   r   r$   r-   r+   ro   ro   u   s         MMM15NHT#u*-.555d333FDI333D#NNNKFNE"""K!vu%%%%%r-   ro   c                   2    e Zd ZdededefdZddZddZdS )		Inferencerr   rp   r   c                     t           )zAPerform a forward pass on the decoder and return per-token logitsNotImplementedErrorselfrr   rp   s      r+   rC   zInference.logits       !!r-   Nc                     t           )z9Update the key-value cache according to the updated beamsr{   )r~   source_indicess     r+   rearrange_kv_cachezInference.rearrange_kv_cache   r   r-   c                     dS )z:Clean up any resources or hooks after decoding is finishedNr$   r~   s    r+   cleanup_cachingzInference.cleanup_caching   s    r-   )r   N)rg   rh   ri   r   rC   r   r   r$   r-   r+   ry   ry      se        "V "V " " " " "" " " "     r-   ry   c                   >    e Zd ZdddefdZdededefdZd	 Zd
 ZdS )PyTorchInferencer   r   initial_token_lengthc                     || _         || _        i | _        g | _        d | j         j        j        D             }d | j         j        j        D             }||z   | _        d S )Nc                 &    g | ]}|j         j        S r$   )attnkeyr&   blocks     r+   r1   z-PyTorchInference.__init__.<locals>.<listcomp>   s    MMM%uz~MMMr-   c                 &    g | ]}|j         j        S r$   )r   valuer   s     r+   r1   z-PyTorchInference.__init__.<locals>.<listcomp>   s    QQQe)QQQr-   )r   r   kv_cachehooksdecoderblocks
kv_modules)r~   r   r   key_modulesvalue_moduless        r+   __init__zPyTorchInference.__init__   se     %
$8!
MM4:3E3LMMMQQtz7I7PQQQ%5r-   rr   rp   r   c                     | j         s&| j                                        \  | _         | _        |j        d         | j        k    r|d d dd f         }| j                            ||| j                   S )Nr   )r   )r   r   install_kv_cache_hooksr   r9   r   r   r}   s      r+   rC   zPyTorchInference.logits   sr    } 	L(,
(I(I(K(K%DM4:<d777AAArssF^Fz!!&.4=!QQQr-   c                 ^    | j         D ]}|                                 i | _        g | _         d S N)r   remover   )r~   hooks     r+   r   z PyTorchInference.cleanup_caching   s4    J 	 	DKKMMMM


r-   c                     |t          t          t          |                              k    r7| j        D ]1}| j        |         |                                         | j        |<   0d S d S r   )rF   rL   lenr   r   detach)r~   r   modules      r+   r   z#PyTorchInference.rearrange_kv_cache   ss    T%N(;(;"<"<====/ W W(,f(=n(M(T(T(V(Vf%% >=W Wr-   N)	rg   rh   ri   rm   r   r   rC   r   r   r$   r-   r+   r   r      s        6i 6s 6 6 6 6RV RV R R R R R  W W W W Wr-   r   c                   ^    e Zd Zdeee                  deee                  dee         fdZdS )SequenceRankerrr   sum_logprobsr   c                     t           )z
        Given a list of groups of samples and their cumulative log probabilities,
        return the indices of the samples in each group to select as the final result
        r{   r~   rr   r   s      r+   rankzSequenceRanker.rank   s
     "!r-   N)rg   rh   ri   r	   r   rl   rm   r   r$   r-   r+   r   r      sV        "4<("8<T%[8I"	c" " " " " "r-   r   c                   j    e Zd ZdZdee         fdZdeee                  deee                  fdZ	dS )MaximumLikelihoodRankerz
    Select the sample with the highest log probabilities, penalized using either
    a simple length normalization or Google NMT paper's length penalty
    r^   c                     || _         d S r   )r^   )r~   r^   s     r+   r   z MaximumLikelihoodRanker.__init__   s    ,r-   rr   r   c                 `      fdd |D             }fdt          ||          D             S )Nc                     g }t          | |          D ]7\  }}j        |}nd|z   dz  j        z  }|                    ||z             8|S )N      )r.   r^   append)logprobslengthsresultlogproblengthpenaltyr~   s         r+   scoresz,MaximumLikelihoodRanker.rank.<locals>.scores   sl    F#&x#9#9 1 1&.$GG !"F
a/D4GGGg/0000Mr-   c                 &    g | ]}d  |D             S )c                 ,    g | ]}t          |          S r$   r   r&   ts     r+   r1   z;MaximumLikelihoodRanker.rank.<locals>.<listcomp>.<listcomp>   s    &&&qCFF&&&r-   r$   )r&   ss     r+   r1   z0MaximumLikelihoodRanker.rank.<locals>.<listcomp>   s'    7771&&A&&&777r-   c                 R    g | ]#\  }}t          j         ||                    $S r$   )rG   rI   )r&   plr   s      r+   r1   z0MaximumLikelihoodRanker.rank.<locals>.<listcomp>   s1    OOODAq	&&A,,''OOOr-   )r.   )r~   rr   r   r   r   s   `   @r+   r   zMaximumLikelihoodRanker.rank   sV    		 		 		 		 		 87777OOOOCg4N4NOOOOr-   N)
rg   rh   ri   __doc__r
   rl   r   r	   r   r   r$   r-   r+   r   r      sy         
-x - - - -P4V- PT$u+=N P P P P P Pr-   r   c            
           e Zd Zd Zdedededeeef         fdZdededeeee                  e	e	e
                  f         fdZdS )	TokenDecoderc                     dS )z=Initialize any stateful variables for decoding a new sequenceNr$   r   s    r+   resetzTokenDecoder.reset   s      r-   rr   rC   r   r   c                     t           )a  Specify how to select the next token, based on the current trace and logits

        Parameters
        ----------
        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        sum_logprobs : Tensor, shape = (n_batch)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
            the tokens, appended with the selected next token

        completed : bool
            True if all sequences has reached the end of text

        r{   )r~   rr   rC   r   s       r+   updatezTokenDecoder.update   s
    2 "!r-   c                     t           )a  Finalize search and return the final candidate sequences

        Parameters
        ----------
        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence

        sum_logprobs : Tensor, shape = (n_audio, n_group)
            cumulative log probabilities for each sequence

        Returns
        -------
        tokens : Sequence[Sequence[Tensor]], length = n_audio
            sequence of Tensors containing candidate token sequences, for each audio input

        sum_logprobs : List[List[float]], length = n_audio
            sequence of cumulative log probabilities corresponding to the above

        r{   r   s      r+   finalizezTokenDecoder.finalize   s
    , "!r-   N)rg   rh   ri   r   r   r   rE   r   r   r	   rl   r   r$   r-   r+   r   r      s        L L L""&,"<B"	vt|	" " " "6"",2"	x()4U+<<	=" " " " " "r-   r   c            
       V    e Zd ZdedefdZdedededeeef         fdZ	dedefd	Z
d
S )GreedyDecoderrY   eotc                 "    || _         || _        d S r   )rY   r   )r~   rY   r   s      r+   r   zGreedyDecoder.__init__  s    &r-   rr   rC   r   r   c                 B   | j         dk    r|                    d          }n*t          || j         z                                            }t	          j        |                                d          }|t          j        |j	        d                   |f         }|||d d df         | j
        k    z  z  }| j
        ||d d df         | j
        k    <   t          j        ||d d d f         gd          }|d d df         | j
        k                                    }||fS )Nr   r   r    )rC   )rY   rI   r   sampleFlog_softmaxrl   r>   aranger9   r   catall)r~   rr   rC   r   next_tokensr   current_logprobs	completeds           r+   r   zGreedyDecoder.update  s    q   --B-//KK%Vd6F-FGGGNNPPK=R888#EL1B$C$C[$PQ(F111b5MTX,EFF15F111b5MTX-.FK4$89rBBBAAArE]dh.3355	y  r-   c                 f    t          j        |d| j                  }||                                fS )N)r   r   )r   )r   padr   tolistr   s      r+   r   zGreedyDecoder.finalize'  s1    vvTX666|**,,,,r-   N)rg   rh   ri   rl   rm   r   r   r   rE   r   r   r$   r-   r+   r   r     s        E     !!&,!<B!	vt|	! ! ! !$-v -V - - - - - -r-   r   c            
       t    e Zd Z	 ddedededee         fdZd Zde	d	e	d
e	de
e	ef         fdZde	d
e	fdZdS )BeamSearchDecoderNr\   r   	inferencer]   c                     || _         || _        || _        |pd| _        t	          || j        z            | _        d | _        | j        dk    sJ d| d| d            d S )Nrd   r   zInvalid beam size (z) or patience ())r\   r   r   r]   roundmax_candidatesfinished_sequences)r~   r\   r   r   r]   s        r+   r   zBeamSearchDecoder.__init__.  s}     #" C#(T])B#C#C"& !###FFF8FFF $####r-   c                     d | _         d S r   )r   r   s    r+   r   zBeamSearchDecoder.reset@  s    "&r-   rr   rC   r   r   c                     |j         d          j        z  dk    rt          |j          d j         d          |j         d          j        z  } j        d t	          |          D              _        t          j        |                                d          }g g g }}}t	          |          D ]k}	i i i }}}
t	           j                  D ]}|	 j        z  |z   }||                                         }t          ||         
                     j        dz              D ]Q\  }}||         |z                                   }t          ||                                gz             }||
|<   |||<   Rd}t          |
|
j        d	          D ]y}|d          j        k    r|
|         ||<   |
|         |t!          |          <   |                    |           |                    ||                    |dz  }| j        k    r nz|                    |           mt%          j        ||j        
          } j                            |           t!           j                  t!          |          k    sJ t           j        |          D ]D\  }}t          ||j        d	          D ]'}t!          |           j        k    r n||         ||<   (Et1           fd j        D                       }||fS )Nr   z[0] % z != 0c                     g | ]}i S r$   r$   )r&   _s     r+   r1   z,BeamSearchDecoder.update.<locals>.<listcomp>K  s    &B&B&Bar&B&B&Br-   r   r    r   T)r   reverserB   c              3   H   K   | ]}t          |          j        k    V  d S r   )r   r   )r&   	sequencesr~   s     r+   	<genexpr>z+BeamSearchDecoder.update.<locals>.<genexpr>z  sE       
 
 	NNd11
 
 
 
 
 
r-   )r9   r\   r6   r   rL   r   r   rl   r   r.   topkr%   tuplesortedgetr   r   r   r>   r?   rB   r   r   r   r   )r~   rr   rC   r   rN   r   r   r   r   r)   r   sourcesfinishedr'   idxr`   r   tokennew_logprobsequencesavedpreviously_finishednewly_finishedseqr   s   `                        r+   r   zBeamSearchDecoder.updateC  sd    <?T^+q00IIDNIIIJJJ,q/T^3"*&B&B5>>&B&B&BD#=R888:<b"%7^w 	0 	0A(*BXGF 4>** , ,$.(1,++--&)8C=+=+=dnq>P+Q+Q&R , ,NGU#/#4w#>"D"D"F"FK$Vuzz||n%<==H'2F8$(+GH%%	, E"6vz4HHH 
 
B<48++)/)9HX&&5;H5EL[!1!12&&x000"))'(*;<<<QJE.. / %%h////k&-@@@)).999 4*++s3E/F/FFFFF36#%74
 4
 	? 	?/ n.2DdSSS ? ?*++t/BBBE+9#+>#C((  
 
 
 
!4
 
 
 
 
	 y  r-   preceding_tokensc                    |                                 }t          | j                  D ]\  }}t          |          | j        k     rt          t          j        ||                             d d d         D ]q}|||f                                         | j	        gz   }||         |         
                                |t          |          <   t          |          | j        k    r nrd | j        D             }d | j        D             }||fS )Nr   c                 J    g | ] }d  |                                 D             !S )c                 6    g | ]}t          j        |          S r$   )r>   r?   )r&   r   s     r+   r1   z9BeamSearchDecoder.finalize.<locals>.<listcomp>.<listcomp>  s"    ;;;3U\#;;;r-   )keysr&   r   s     r+   r1   z.BeamSearchDecoder.finalize.<locals>.<listcomp>  s@     &
 &
 &
 <;)..*:*:;;;&
 &
 &
r-   c                 P    g | ]#}t          |                                          $S r$   )rF   valuesr   s     r+   r1   z.BeamSearchDecoder.finalize.<locals>.<listcomp>  s9     +
 +
 +
)2D!!##$$+
 +
 +
r-   )rK   	enumerater   r   r\   rF   rG   argsortr   r   r%   r   )r~   r   r   r)   r   r'   r   rr   s           r+   r   zBeamSearchDecoder.finalize  s)   #''))%d&=>> 	 	LAyI//bja99::44R4@  A/15<<>>$(KH1=a1C1H1H1J1JIeHoo.9~~77 8&
 &
!4&
 &
 &
+
 +
6:6M+
 +
 +
 |##r-   r   )rg   rh   ri   rm   ry   r
   rl   r   r   r   r   rE   r   r   r$   r-   r+   r   r   -  s         %)G GG G 	G
 5/G G G G$' ' ';!;!&,;!<B;!	vt|	;! ;! ;! ;!z$ $v $ $ $ $ $ $r-   r   c                   "    e Zd ZdededdfdZdS )LogitFilterrC   rr   r   Nc                     t           )a  Apply any filtering or masking to logits in-place

        Parameters
        ----------
        logits : Tensor, shape = (n_batch, vocab_size)
            per-token logits of the probability distribution at the current step

        tokens : Tensor, shape = (n_batch, current_sequence_length)
            all tokens in the context so far, including the prefix and sot_sequence tokens

        r{   r~   rC   rr   s      r+   applyzLogitFilter.apply  s
     "!r-   )rg   rh   ri   r   r  r$   r-   r+   r  r    s=        "F "F "t " " " " " "r-   r  c                   .    e Zd ZdedefdZdedefdZdS )SuppressBlankr   sample_beginc                 "    || _         || _        d S r   )r   r  )r~   r   r  s      r+   r   zSuppressBlank.__init__  s    "(r-   rC   rr   c                     |j         d         | j        k    r<t          j         |d d | j                            d          | j        j        gz   f<   d S d S )Nr    )r9   r  rG   rH   r   encoder   r  s      r+   r  zSuppressBlank.apply  sX    <?d///LNF7F111dn++C00DN4F3GGGHHH 0/r-   N)rg   rh   ri   r   rm   r   r   r  r$   r-   r+   r
  r
    s`        )) )3 ) ) ) )SF SF S S S S S Sr-   r
  c                   6    e Zd Zdee         fdZdedefdZdS )SuppressTokensra   c                 .    t          |          | _        d S r   )rF   ra   r~   ra   s     r+   r   zSuppressTokens.__init__  s    #O44r-   rC   rr   c                 8    t           j         |d d | j        f<   d S r   )rG   rH   ra   r  s      r+   r  zSuppressTokens.apply  s"    +-6'qqq$&&'''r-   N)rg   rh   ri   r   rm   r   r   r  r$   r-   r+   r  r    sU        5 5 5 5 52F 2F 2 2 2 2 2 2r-   r  c                   >    e Zd Zdededee         fdZdedefdZdS )	ApplyTimestampRulesr   r  max_initial_timestamp_indexc                 0    || _         || _        || _        d S r   )r   r  r  )r~   r   r  r  s       r+   r   zApplyTimestampRules.__init__  s!     #(+F(((r-   rC   rr   c                 L   | j         j        t          j         |d d | j         j        f<   t	          |j        d                   D ]8}||| j        d f         }d |                                D             }t          |          dk    o|d         | j         j	        k    }t          |          dk     p|d         | j         j	        k    }|r?|rt          j         ||| j         j	        d f<   nt          j         ||d | j         j
        f<   ||                    | j         j	                           }|                                dk    r6|r|s	|d         }	n|d         dz   }	t          j         ||| j         j	        |	f<   :|j        d         | j        k    rTt          j         |d d d | j         j	        f<   | j        -| j         j	        | j        z   }
t          j         |d d |
dz   d f<   t          j        |                                d          }t	          |j        d                   D ]x}||| j         j	        d f                             d          }||d | j         j	        f                                         }||k    rt          j         ||d | j         j	        f<   yd S )Nr   c                     g | ]}|S r$   r$   r   s     r+   r1   z-ApplyTimestampRules.apply.<locals>.<listcomp>  s    6661666r-   r   r   r   r   r    )r   no_timestampsrG   rH   rL   r9   r  r   r   timestamp_beginr   genumelr  r   r   rl   	logsumexpmax)r~   rC   rr   ksampled_tokensr   last_was_timestamppenultimate_was_timestamp
timestampstimestamp_lastlast_allowedr   timestamp_logprobmax_text_token_logprobs                 r+   r  zApplyTimestampRules.apply  s   >'379vgF111dn223 v|A'' 	U 	UA#At'8':':$:;N66n3355666CCAK#b'T^-K"K  C1IB4>+I I & " >, >CE6'F1dn<>>>??79vgF12 2223'!!$."@AAJ !!A%% & 8.G 8%/^NN%/^a%7NNPfWq$.8>IIJ<?d///;=6'F11166667 /;N2T5UU  24qqq,*,,,- =R888v|A'' 	F 	FA (DN,J,L,L)L M W W !X ! ! &.a1Q4>3Q1Q.Q%R%V%V%X%X" #999?Avgq:DN:::;	F 	Fr-   N)	rg   rh   ri   r   rm   r
   r   r   r  r$   r-   r+   r  r    sy        GG G &.c]	G G G G5FF 5FF 5F 5F 5F 5F 5F 5Fr-   r  c                      e Zd ZU eed<   eed<   eed<   ee         ed<   ddde	fdZ
de	d	e	fd
Zd	ee         fdZd	ee         fdZdefdZdedefdZdedefdZ ej                    ded	ee         fd            ZdS )DecodingTaskr   sequence_rankerr   logit_filtersr   r   optionsc                 :   || _         |j        pd}t          |j        |j        ||j                  }|| _        |                     |          | _        |j	        p|j
        pd| _        |j        j        | _        |j        p|j        j        dz  | _        |j        | _        | j        j        r|j        | _        |                                 | _        t+          | j                  | _        | j                            |j                  | _        t5          |t+          | j                            | _        t9          |j                  | _        |j	        ,t?          |j	        |j         | j        |j!                  | _"        ntG          |j$        |j                   | _"        g | _%        | j        j&        r2| j%        '                    tQ          | j        | j                             | j        j)        r9| j%        '                    tU          | +                                                     |j        sitX          |j        j-        z  }d }|j.        rt_          | j        j.        |z            }| j%        '                    ta          || j        |                     d S d S )Nen)r   r3   rX   r   r   )1r   r3   r   r2   r   rX   r   _verify_optionsr.  r\   r[   n_groupr:   
n_text_ctxn_ctxrZ   r5   rc   #sot_sequence_including_notimestamps_get_initial_tokensinitial_tokensr   r  indexr@   	sot_indexr   r   r   r^   r,  r   r   r]   r   r   rY   r-  rb   r   r
  ra   r  _get_suppress_tokensr   r;   re   r   r  )r~   r   r.  r3   r   	precisionr  s          r+   r   zDecodingTask.__init__  s   
#+t!!-	
 
 
	 %.(,(<(<W(E(E#-EEA*/
&1OUZ5Ja5O(1(><* 	N ) MD*.*B*B*D*D!$T%8!9!9"177	FF *%T5H1I1IJJ  7w7MNN (,!9=$.'BR DLL ))<imLLDL  <& 	X%%mDNDDU&V&VWWW<' 	S%%nT5N5N5P5P&Q&QRRR) 	$uz'==I*.', .3L6B/ /+ %%#t02M     	 	r-   r   c                    |j         |j        t          d          |j        dk    r|j        t          d          |j        |j         t          d          |j        #d|j        cxk    rdk    sn t          d          |S )Nz-beam_size and best_of can't be given togetherr   z4best_of with greedy sampling (T=0) is not compatiblez'patience requires beam_size to be givenr   z8length_penalty (alpha) should be a value between 0 and 1)r\   r[   r6   rY   r]   r^   )r~   r.  s     r+   r1  zDecodingTask._verify_options<  s    (W_-HLMMM!##* !WXXX'G,=,EFGGG!-',,,,1,,,,WXXXr-   c                 8   t          | j                  }| j        j        x}rot	          |t
                    r/| j                            d|                                z             n|}| j	        | j
        dz  | j	        z
  }|| d          }||z   }| j        j        x}rmt	          |t
                    r/| j                            d|                                z             n|}| j        j        g|| j
        dz  dz
   d          z   |z   }t          |          S )Nr  r   r   )rF   r5   r.  r`   
isinstancerj   r   r  striprZ   r4  r_   sot_prevr   )r~   rr   r`   prefix_tokensmax_prefix_lenr_   prompt_tokenss          r+   r6  z DecodingTask._get_initial_tokensK  s3   d'((\((6 		, fc**%%cFLLNN&:;;; 
 *!%q4?!B -~o.>.> ?m+F\((6 
	 fc**%%cFLLNN&:;;;  ()$*/A"5 6 8 89:  V}}r-   c                    | j         j        }t          |t                    rd |                    d          D             }d|v r,d |D             }|                    | j        j                   n7|t          |          dk    rg }nt          |t                    s
J d            |                    | j        j
        | j        j        | j        j        | j        j        | j        j        g           | j        j        |                    | j        j                   t#          t%          t'          |                              S )Nc                 ,    g | ]}t          |          S r$   )rm   r   s     r+   r1   z5DecodingTask._get_suppress_tokens.<locals>.<listcomp>k  s    JJJ!s1vvJJJr-   ,r   c                     g | ]
}|d k    |S )r   r$   r   s     r+   r1   z5DecodingTask._get_suppress_tokens.<locals>.<listcomp>n  s    DDDQQ!VVqVVVr-   r   zsuppress_tokens must be a list)r.  ra   r>  rj   splitextendr   non_speech_tokensr   rF   rW   	translater@   r@  sot_lm	no_speechr   r   r   setr  s     r+   r:  z!DecodingTask._get_suppress_tokensg  sA   ,6os++ 	KJJ/D/DS/I/IJJJO  DD/DDDO""4>#CDDDD$O(<(<(A(A OOot44VV6VVVV)("'%	
 	
 	
 >#/""4>#;<<<VC0011222r-   r   c                 n   | j         j        r|                                }|j        dd          | j        j        j        | j        j        j        fk    r|}n| j                            |          }|j	        | j         j        rt          j        nt          j        k    rt          d|j	                   S |S )Nr   z'audio_features has an incorrect dtype: )r.  rf   halfr9   r   r:   r;   r<   r=   r   r>   float16float32	TypeError)r~   r   rp   s      r+   _get_audio_featuresz DecodingTask._get_audio_features  s    < 	((**C9RSS>JO'JO)
 
 

 !NN!Z//44N!\.AEMMEM
 
 P.:NPP   r-   rp   rr   c                    | j         j        g|j        d         z  }d }| j         j        | j         j        dk    rL| j                            || j                  \  }}d |D             }| j         j        ||d d | j        dz   f<   ||fS )Nr   lang_idc                 :    g | ]}t          ||j                   S ))r   )r   r   )r&   probss     r+   r1   z1DecodingTask._detect_language.<locals>.<listcomp>  s'    KKKuU	222KKKr-   r   )r.  r3   r9   rX   r   rS   r   r9  )r~   rp   rr   	languages
lang_probslang_tokenss         r+   _detect_languagezDecodingTask._detect_language  s    \*+n.B1.EE	
< (DL,=,J,J&*j&@&@' '#K LK
KKKI|$,0;qqq$.1,,-*$$r-   c                    |j         d         }t          j        ||j                  }t          j        g|z  }	 t          | j                  D ]}| j        	                    ||          }|dk    rk| j
        j        _|d d | j        f                                                             d          }|d d | j
        j        f                                         }|d d df         }| j        D ]}	|	                    ||           | j                            |||          \  }}
|
s|j         d         | j        k    r n| j                                         n# | j                                         w xY w|||fS )Nr   r   r   r    )r9   r>   zerosrB   rG   rw   rL   rZ   r   rC   r   rM  r9  rl   rJ   r   r-  r  r   r   r4  r   )r~   rp   rr   n_batchr   no_speech_probsr)   rC   probs_at_sotlogit_filterr   s              r+   
_main_loopzDecodingTask._main_loop  s   ,q/${7>;PQQQ6(W,	-4?++  ..v~FF FFt~7C#)!!!T^*;#<#B#B#D#D#L#LQS#L#T#TL&2111dn6N3N&O&V&V&X&XO  2 %)$6 7 7L &&vv6666 %)L$7$7$U$U!	 R 04: = =E !> N**,,,,DN**,,,,|_44s   DE E7c           	      V     j                                           j        |j        d         }                     |          }t          j         j        g                              |d          } 	                    ||          \  }} j
        j        dk    rd t          |||          D             S |                     j        d                              |j                  }                     ||          \  }}}|d d  j                 }|d d  j                 }|j        d         t%          |          cxk    r|k    sn J |                    | j        d          }|                    | j                  } j                             ||          \  }} fd|D             } j                            ||          }	d t          |	|          D             }fd	|D             }
d
 t          |	|          D             }d t          ||          D             }|
|||||f}t%          t/          t1          t$          |                              dk    r2t3          dt5          t1          t$          |                                fdt          | D             S )Nr   r   rV  c                 :    g | ]\  }}}t          |||           S ))rp   r3   rR   )ro   )r&   featuresr3   rX  s       r+   r1   z$DecodingTask.run.<locals>.<listcomp>  sF        .Hh #+hu    r-   r    r   c                 .    g | ]}fd |D             S )c                 p    g | ]2}|j         |j        k                                    d                   3S ))r   r   )r  r   nonzero)r&   r   r~   r   s     r+   r1   z/DecodingTask.run.<locals>.<listcomp>.<listcomp>  s>    TTTQQt A$6#?#?#A#A$#GGHTTTr-   r$   )r&   r   r~   r   s     r+   r1   z$DecodingTask.run.<locals>.<listcomp>  sB     &
 &
 &
 UTTTTRSTTT&
 &
 &
r-   c                 H    g | ]\  }}||                                           S r$   )r   )r&   r)   r   s      r+   r1   z$DecodingTask.run.<locals>.<listcomp>  s(    "S"S"STQ1Q4;;=="S"S"Sr-   c                 ^    g | ])}                     |                                          *S r$   )decoder?  )r&   r   r   s     r+   r1   z$DecodingTask.run.<locals>.<listcomp>  s3    HHHAI,,Q//5577HHHr-   c                 $    g | ]\  }}||         S r$   r$   )r&   r)   lps      r+   r1   z$DecodingTask.run.<locals>.<listcomp>  s     $T$T$Tuq"RU$T$T$Tr-   c                 >    g | ]\  }}|t          |          d z   z  S )r   r   )r&   r   rn  s      r+   r1   z$DecodingTask.run.<locals>.<listcomp>  s8     %
 %
 %
"'!RB#a&&1*%
 %
 %
r-   zinconsistent result lengths: c                 z    g | ]7\  }}}}}}t          ||||||j        j        t          |                     8S ))rp   r3   rr   rt   ru   rv   rY   r   )ro   r.  rY   r   )r&   rt   r3   rr   rf  ru   rv   r~   s          r+   r1   z$DecodingTask.run.<locals>.<listcomp>  sl     
 
 
 Nh+~ '!'- L4"3D"9"9	 	 	
 
 
r-   )r   r   r   r9   rT  r>   r?   r7  repeatr\  r.  rX   r.   repeat_interleaver2  rA   rB   rc  r   reshaper   r,  r   rN  mapRuntimeErrorrF   )r~   r   rN   rp   rr   rY  rR   r   r`  selectedtextsavg_logprobsfieldsr   s   `            @r+   runzDecodingTask.run  s   #~	y|!%!9!9#!>!>t':&;<<CCGQOO %)$9$9.&$Q$Q!	><	))  25"I~2 2	    ))$,A)>>AA.BWXX 15PV0W0W-o (4<8)//T\/:#A&#o*>*>IIII'IIIIIIr::#++GT\BB  $|44V\JJ&
 &
 &
 &
 &
&
 &
 &
 ',,V\BB"S"SS6=R=R"S"S"SHHHHHHH$T$THl8S8S$T$T$T%
 %
+.v|+D+D%
 %
 %

 
 s3sF##$$%%**WtCVDTDT?U?UWWXXX
 
 
 
 RUR
 
 
 	
r-   N)rg   rh   ri   ry   rk   r   r   r	   r  rV   r   r1  r   rm   r6  r:  r   rT  r\  rc  r>   no_gradro   rz  r$   r-   r+   r+  r+    sq        ####$$$$8i 8/ 8 8 8 8t ?    U3Z    83eCj 3 3 3 3:v    ,%v %v % % % %5 5 5 5 5 5@ U]__L
v L
$~"6 L
 L
 L
 _L
 L
 L
r-   r+  r.  c                     |j         dk    x}r|                    d          }|rt          |fi |}t          | |                              |          }|r|d         n|S )a;  
    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).

    Parameters
    ----------
    model: Whisper
        the Whisper model instance

    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
        A tensor containing the Mel spectrogram(s)

    options: DecodingOptions
        A dataclass that contains all necessary options for decoding 30-second segments

    Returns
    -------
    result: Union[DecodingResult, List[DecodingResult]]
        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
    r   r   )r7   r8   r   r+  rz  )r   r   r.  kwargsrM   r   s         r+   rl  rl    su    4 Qv mmA -',,V,,%))--c22F*6!99F*r-   r   )2dataclassesr   r   r   typingr   r   r   r	   r
   r   r   r   numpyrG   r>   torch.nn.functionalnn
functionalr   r   torch.distributionsr   audior   r   r   r   utilsr   r   r   r{  dictrS   rV   ro   ry   r   r   r   r   r   r   r  r
  r  r  r+  rl  r$   r-   r+   <module>r     s   1 1 1 1 1 1 1 1 1 1 X X X X X X X X X X X X X X X X X X X X                     + + + + + +       / / / / / / / / $ $ $ $ $ $  :>:+ :+:+!:+.7:+
64::+ :+ :+ :+z $! ! ! ! ! ! ! !H $	& 	& 	& 	& 	& 	& 	& 	&        W  W  W  W  Wy  W  W  WF" " " " " " " "P P P P Pn P P P45" 5" 5" 5" 5" 5" 5" 5"p- - - - -L - - -:g$ g$ g$ g$ g$ g$ g$ g$T" " " " " " " " S S S S SK S S S2 2 2 2 2[ 2 2 2@F @F @F @F @F+ @F @F @FFY
 Y
 Y
 Y
 Y
 Y
 Y
 Y
x   /00!+ !+!+	!+ !+
 >4//0!+ !+ !+ !+ !+ !+r-   