
    Sܶi20                        d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlZi ddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&i d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHi dIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidji dkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~ddddddddddddddi ddddddddddddddddddddddddddddddddddddddddddddddddddZi d e                                D             dddddddd7d7ddddZe G dÄ dĦ                      Z edŦ          ddedefdʄ            Z edŦ          dddd˜dededee         dee         def
dЄ            ZdS )    N)	dataclassfield)cached_property	lru_cache)DictListOptionalTupleenenglishzhchinesedegermanesspanishrurussiankokoreanfrfrenchjajapanesept
portuguesetrturkishplpolishcacatalannldutchararabicsvswedishititalianid
indonesianhihindififinnishvi
vietnamesehehebrewuk	ukrainianelgreekmsmalaycsczechroromaniandadanishhu	hungariantatamilno	norwegianththaiururduhrcroatianbg	bulgarianlt
lithuanianlalatinmimaoriml	malayalamcywelshskslovaktetelugufapersianlvlatvianbnbengalisrserbianazazerbaijanisl	slovenianknkannadaetestonianmk
macedonianbrbretoneubasqueis	icelandichyarmeniannenepalimn	mongolianbsbosniankkkazakhsqalbanianswswahiliglgalicianmrmarathipapunjabisisinhalakmkhmersnshonayoyorubasosomaliaf	afrikaansococcitankageorgianbe
belarusiantgtajiksdsindhigugujaratiamamharicyiyiddishlolaouzuzbekfofaroesehtzhaitian creolepspashtotkturkmennnnynorskmtmaltesesanskritluxembourgishmyanmartibetantagalogmalagasyassamesetatarhawaiianlingalahausabashkirjavanese	sundanese	cantonese)salbmybotlmgastthawlnhabajwsuyuec                     i | ]\  }}||	S  r   ).0codelanguages      T/root/projects/openclaw-proxy/venv/lib/python3.11/site-packages/whisper/tokenizer.py
<dictcomp>r   s   s    >>>.$x>>>    r   r   )burmese	valencianflemishhaitianletzeburgeschpushtopanjabi	moldavianmoldovan	sinhalese	castilianmandarinc                      e Zd ZU dZej        ed<   eed<   dZe	e
         ed<   dZe	e
         ed<   dZee         ed<    ee	          Zee
ef         ed
<   d Zd Zdee         de
fdZdee         de
fdZedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Zedefd            Z edefd            Z!d Z"edee         fd            Z#edee
         fd            Z$edee         fd            Z%edee         fd            Z&d ee         fd!Z'd ee         fd"Z(d ee         fd#Z)dS )$	TokenizerzIA thin wrapper around `tiktoken` providing quick access to special tokensencodingnum_languagesNr   taskr   sot_sequence)default_factoryspecial_tokensc                    | j         j        D ]&}| j                             |          }|| j        |<   '| j        d         }| j        d         }| j        d         }t	          t
                                                    d | j                 }|g}| j        3|	                    |dz   |
                    | j                  z              | j        $| j        dk    r|n|}|	                    |           t	          |          | _        d S )N<|startoftranscript|><|translate|><|transcribe|>   
transcribe)r   special_tokens_setencode_single_tokenr   tuple	LANGUAGESkeysr   r   appendindexr   r   )	selfspecialspecial_tokensot	translater   langsr   
task_tokens	            r   __post_init__zTokenizer.__post_init__   s
   }7 	9 	9G M==gFFM+8D((&'>?,_=	-.>?
inn&&''(<$*<(<=u=$a%++dm*D*D DEEE9 ,0I,E,Ejj9J
+++!,//r   c                 (     | j         j        |fi |S N)r   encode)r   textkwargss      r   r  zTokenizer.encode   s    #t}#D33F333r   	token_idsreturnc                 F      fd|D             }  j         j        |fi |S )Nc                 *    g | ]}|j         k     |S r   )timestamp_begin)r   tr   s     r   
<listcomp>z$Tokenizer.decode.<locals>.<listcomp>   s&    FFF1Q1E-E-EQ-E-E-Er   r   decoder   r  r  s   `  r   r  zTokenizer.decode   s9    FFFF	FFF	#t}#I88888r   c                 (     | j         j        |fi |S )z
        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        r  r  s      r   decode_with_timestampsz Tokenizer.decode_with_timestamps   s!    
 $t}#I88888r   c                     | j         j        S r  )r   	eot_tokenr   s    r   eotzTokenizer.eot   s    }&&r   c                     | j         d         S )Nr   r   r  s    r   r   zTokenizer.transcribe   s    "#344r   c                     | j         d         S )Nr   r  r  s    r   r   zTokenizer.translate       "?33r   c                     | j         d         S )Nr   r  r  s    r   r   zTokenizer.sot   s    "#:;;r   c                     | j         d         S )N<|startoflm|>r  r  s    r   sot_lmzTokenizer.sot_lm   r  r   c                     | j         d         S )N<|startofprev|>r  r  s    r   sot_prevzTokenizer.sot_prev   s    "#455r   c                     | j         d         S )N<|nospeech|>r  r  s    r   	no_speechzTokenizer.no_speech   s    ">22r   c                     | j         d         S )N<|notimestamps|>r  r  s    r   no_timestampszTokenizer.no_timestamps   s    "#566r   c                     | j         d         S )Nz<|0.00|>r  r  s    r   r  zTokenizer.timestamp_begin   s    ":..r   c                 b    | j         t          d          |                     | j                   S )zGReturns the token id corresponding to the value of the `language` fieldNz6This tokenizer does not have language token configured)r   
ValueErrorto_language_tokenr  s    r   language_tokenzTokenizer.language_token   s1     = UVVV%%dm444r   c                 n    | j                             d| dd           x}r|S t          d| d          )N<||>z	Language z not found in tokenizer.)r   getKeyError)r   r   tokens      r   r,  zTokenizer.to_language_token   sM    '++,=,=,=,=tDDD5 	LE8EEEFFFr   c                     g }| j                                         D ]6\  }}|                    d          t          v r|                    |           7t          |          d | j                 S )N<|>)r   itemsstripr   r   r   r   )r   resultr3  token_ids       r   all_language_tokenszTokenizer.all_language_tokens   sm    #288:: 	( 	(OE8{{5!!Y..h'''V}}1t1122r   c                 D     t           fd j        D                       S )Nc              3   j   K   | ]-}                     |g                              d           V  .dS )r5  N)r  r7  )r   _lr   s     r   	<genexpr>z/Tokenizer.all_language_codes.<locals>.<genexpr>   s?      WWT[["&&,,U33WWWWWWr   )r   r:  r  s   `r   all_language_codeszTokenizer.all_language_codes   s)    WWWWd>VWWWWWWr   c                 V    t          t          | j                  | j        gz             S r  )r   listr   r(  r  s    r   #sot_sequence_including_notimestampsz-Tokenizer.sot_sequence_including_notimestamps   s&    T$+,,0B/CCDDDr   c                 \   t          d          }|d                                z  }t          d          }t          d |D                       sJ | j                            d          d         | j                            d          d         h}|t          |          z   D ]n}| j                            |          | j                            d|z             fD ]4}t          |          d	k    s||v r|                    |d                    5ot          t          |                    S )
u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c              3   P   K   | ]!}d t          |          cxk    odk    nc V  "dS )i@&  i&  N)ord)r   cs     r   r>  z.Tokenizer.non_speech_tokens.<locals>.<genexpr>  sD      EE!6SVV----v----EEEEEEr   z -r   z ' r   )
rA  splitsetallr   r  lenaddr   sorted)r   symbolsmiscellaneousr8  symboltokenss         r   non_speech_tokenszTokenizer.non_speech_tokens   s:    =>>Z``bb	
 344EE}EEEEEEEE -&&t,,Q/1E1Ed1K1KA1NO] 3 33 	* 	*F$$V,,$$S6\22 * * v;;!##v'>'>JJvay)))* VF^^$$$r   rQ  c                 h    | j         dv r|                     |          S |                     |          S )N>   r   r   r   rG   r   r   )r   split_tokens_on_unicodesplit_tokens_on_spaces)r   rQ  s     r   split_to_word_tokenszTokenizer.split_to_word_tokens  s:    =AAA //777**6222r   c                 n   |                      |          }d}g }g }g }d}|D ]}|                    |           |                      |          }	||	vs"|||	                    |          z            |k    r>|                    |	           |                    |           g }|t          |	          z  }||fS )Nu   �r   )r  r   r   rK  )
r   rQ  decoded_fullreplacement_charwordsword_tokenscurrent_tokensunicode_offsetr3  decodeds
             r   rT  z!Tokenizer.split_tokens_on_unicode  s    226::# 	/ 	/E!!%(((11.AAG !//?O1P1P PQ#$ $ W%%%"">222!##g,,.k!!r   c                    |                      |          \  }}g }g }t          ||          D ]\  }}|d         | j        k    }|                    d          }	|                                t
          j        v }
|s|	s|
st          |          dk    r+|                    |           |                    |           |d         |z   |d<   |d         	                    |           ||fS )Nr   rG  )
rT  zipr  
startswithr7  stringpunctuationrK  r   extend)r   rQ  subwordssubword_tokens_listrZ  r[  subwordsubword_tokensr   
with_spacerd  s              r   rU  z Tokenizer.split_tokens_on_spaces7  s   (,(D(DV(L(L%%'*85H'I'I 		7 		7#G^$Q'483G ++C00J!--//V-??K 7* 7 7s5zzQW%%%"">2222!"I/b	B&&~6666k!!r   )*__name__
__module____qualname____doc__tiktokenEncoding__annotations__intr   r	   strr   r   r
   r   dictr   r   r  r  r   r  r  r   r  r   r   r   r  r"  r%  r(  r  r-  r,  r:  r?  rB  rR  rV  rT  rU  r   r   r   r   r      s        SS"Hhsm"""D(3-!L%*!!!%*U4%@%@%@NDcN@@@0 0 0&4 4 49S	 9 9 9 9 99S	 9 9 9 9 9 'S ' ' ' _' 5C 5 5 5 _5 43 4 4 4 _4 <S < < < _< 4 4 4 4 _4 6# 6 6 6 _6 33 3 3 3 _3 7s 7 7 7 _7 / / / / _/ 5 5 5 5 _5G G G 3U3Z 3 3 3 _3 XE#J X X X _X EU3Z E E E _E !%5: !% !% !% _!%F349 3 3 3 3"d3i " " " "2"T#Y " " " " " "r   r   )maxsizegpt2c   namer   c                 B   t           j                            t           j                            t                    d|  d          }d d t          |          D             D             }t          |          }i }ddgd t          t          	                                          d |         D             dd	d
dddd t          d          D             }|D ]}|||<   |dz  }t          j        t           j                            |          |d||          S )Nassetsz	.tiktokenc                 X    i | ]'\  }}t          j        |          t          |          (S r   )base64	b64decoderr  )r   r3  ranks      r   r   z get_encoding.<locals>.<dictcomp>M  s?       E4 	T  r   c              3   B   K   | ]}||                                 V  d S r  )rH  )r   lines     r   r>  zget_encoding.<locals>.<genexpr>O  s/      NNTNDJJLLNNNNNNr   z<|endoftext|>r   c                     g | ]}d | d	S )r/  r0  r   )r   langs     r   r  z get_encoding.<locals>.<listcomp>W  s     	K	K	KD-t---	K	K	Kr   r   r   r  r!  r$  r'  c                 "    g | ]}d |dz  ddS )r/  g{Gz?z.2fr0  r   )r   is     r   r  z get_encoding.<locals>.<listcomp>^  s+    	5	5	5A
q4x



	5	5	5r   i  r   zJ's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)rx  explicit_n_vocabpat_strmergeable_ranksr   )ospathjoindirname__file__openrK  rA  r   r   rangero  rp  basename)rx  r   
vocab_pathranksn_vocabr   specialsr3  s           r   get_encodingr  J  sr   bgooh77dCUCUCUVVJ NNT*5E5ENNN  E %jjGN 	 
L	KD)9)9$:$:>M>$J	K	K	K 		
 	 	 	 	 	 
6	5t	5	5	5H    'u1Wj)) a%   r   )r   r   r   multilingualr   r   r	  c                    |F|                                 }|t          vr)|t          v rt          |         }nt          d|           | rd}|pd}|pd}nd}d }d }t	          ||          }t          ||||          S )NzUnsupported language: r  r   r   rv  )rx  r   )r   r   r   r   )lowerr   TO_LANGUAGE_CODEr+  r  r   )r  r   r   r   encoding_namer   s         r   get_tokenizerr  n  s     >>##9$$++++H5 !D(!D!DEEE &#t#|mLLLHPT   r   )rv  rw  )r|  r  rc  dataclassesr   r   	functoolsr   r   typingr   r   r	   r
   ro  r   r6  r  r   rs  rr  r  boolr  r   r   r   <module>r     s    				  ( ( ( ( ( ( ( ( 0 0 0 0 0 0 0 0 . . . . . . . . . . . . e)e)e 	(e 	)	e
 	)e 	(e 	(e 	*e 	,e 	)e 	(e 	)e 	'e 	(e 	)e  	)!e" 	,#e e$ 	'%e& 	)'e( 	,)e* 	(+e, 	+-e. 	'/e0 	'1e2 	'3e4 	*5e6 	(7e8 	+9e: 	';e< 	+=e> 	&?e@ 	&AeB 	*CeD 	+Ee e eF 	,GeH 	'IeJ 	'KeL 	+MeN 	'OeP 	(QeR 	(SeT 	)UeV 	)WeX 	)YeZ 	)[e\ 	-]e^ 	+_e` 	)aeb 	*ced 	,eef 	(ge e eh 	(iej 	+kel 	*men 	(oep 	+qer 	)set 	(uev 	*wex 	)yez 	*{e| 	)}e~ 	)e@ 	)AeB 	'CeD 	'EeF 	(GeH 	(Ie e eJ 	+KeL 	)MeN 	*OeP 	,QeR 	'SeT 	(UeV 	*WeX 	)YeZ 	)[e\ 	%]e^ 	'_e` 	)aeb 	
ced 	(eef 	)geh 	)iej 	)ke el 











Ie e e	P>>IOO,=,=>>>   " C" C" C" C" C" C" C" C"L 4   s  C        F 4 "    sm	
 3-      r   