
    SܶiQ                         d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ ddlmZ  G d d          Z G d	 d
          Z G d d          ZdS )    N)Fraction)IteratorListMatchOptionalUnion)windowed   )remove_symbols_and_diacriticsc                   n     e Zd ZdZ fdZdee         dee         fdZdefdZ	defdZ
defd	Z xZS )
EnglishNumberNormalizerav  
    Convert any spelled-out numbers into arabic numbers, while handling:

    - remove any commas
    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    - spell out `one` and `ones`
    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    c                    t                                                       h d| _        d t          g dd          D             | _        d | j                                        D             | _        ddd	d
dddd | j                                        D             | _        i | j        | j        | _        ddddddddd| _	        d | j	                                        D             | _
        d | j	                                        D             | _        i | j
        | j        | _        ddddddd d!d"d#d$d%d&| _        d' | j                                        D             | _        d( | j                                        D             | _        i | j        | j        | _        h | j        | j	        | j        | _        d)d)d*d*d+| _        d,d,d-d-d.d.d/d/d0| _        t)          t+          | j                                                  t+          | j                                                  z             | _        d1d2id2d3| _        h d4| _        t)          d5 | j        | j        | j        | j	        | j        | j        | j        | j        | j        | j        | j        fD                       | _        d6d7h| _        d S )8N>   oohzeroc                     i | ]\  }}||	S  r   ).0inames      ^/root/projects/openclaw-proxy/venv/lib/python3.11/site-packages/whisper/normalizers/english.py
<dictcomp>z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>   s.     
 
 
4 !
 
 
    )onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteenr
   )startc                 4    i | ]\  }}|d k    rdn|dz   |dfS )r   sixessr   r   r   values      r   r   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>6   sB     
 
 
e u}}GG$*ucl
 
 
r   )r   th)r
   st)   nd)   rd)   r3   )   r3   )zerothfirstsecondthirdfifthtwelfthc                 v    i | ]6\  }}|d k    |dk    |dk    ||                     d          rdndz   |df7S )r7   r9   r:   thr3   )endswithr1   s      r   r   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>A   s]       D%199! t}}S11;t<udm0;r         (   2   <   F   P   Z   )twentythirtyfortyfiftysixtyseventyeightyninetyc                 F    i | ]\  }}|                     d d          |dfS )yiesr0   replacer1   s      r   r   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>S   s=     
 
 
7BtUDLLe$$ucl
 
 
r   c                 F    i | ]\  }}|                     d d          |dfS )rV   iethr3   rX   r1   s      r   r   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>V   s@     
 
 
e LLf%%t}
 
 
r   d     i@B i ʚ;l    J)l     I5 l     NZol     @=7M.cl      B3v^!< l      P ~cegl       73Me'l       (l
F3YHqS )hundredthousandmillionbilliontrillionquadrillionquintillion
sextillion
septillion	octillion	nonillion	decillionc                 $    i | ]\  }}|d z   |d fS r0   r   r1   s      r   r   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>j   s3     #
 #
 #
)4uD3J#
 #
 #
r   c                 $    i | ]\  }}|d z   |d fS )r3   r   r1   s      r   r   z4EnglishNumberNormalizer.__init__.<locals>.<dictcomp>m   s3     $
 $
 $
+64D4K%$
 $
 $
r   -+)minusnegativepluspositive   £u   €$   ¢)poundpoundseuroeurosdollardollarscentcentsr|   %)perpercent>   andpointdoubletriplec                     g | ]	}|D ]}|
S r   r   )r   mappingkeys      r   
<listcomp>z4EnglishNumberNormalizer.__init__.<locals>.<listcomp>   sA        #      r   r   ones)super__init__zeros	enumerater   itemsones_pluralones_ordinalones_suffixedtenstens_pluraltens_ordinaltens_suffixedmultipliersmultipliers_pluralmultipliers_ordinalmultipliers_suffixeddecimalspreceding_prefixersfollowing_prefixerssetlistvaluesprefixes	suffixersspecialswordsliteral_words)self	__class__s    r   r   z EnglishNumberNormalizer.__init__   s   (((

 
$  * -  
 
 
	6
 
#y00
 
 

  !
 
 #'9??#4#4  
 G 0FD4EF 	
 	
	
 
FJiooFWFW
 
 

 
#y00
 
 
 G 0FD4EF  $)047;>BF
 
#
 #
8<8H8N8N8P8P#
 #
 #
$
 $
:>:J:P:P:R:R$
 $
 $
 %
%%
&%
! >$)=di=$*= 	$
 $
  	$
 	$
  )0022334+2244556
 

 C=
 
 =<<  JI&I&$-,,NM   
 

& $V_r   r   returnc              #     K   d d d}dt           fd}dt          t           t          f         ffd}t          |          dk    rd S t	          d g|z   d gz   d          D ]\  }}}|rd}|d uot          j        d|          }|d         | j        v }	|	r
|d	d          n|}
t          j        d|
          r ||
          }|J Wt          t                     r5	                    d
          r t                    t          |          z    |          V  |	r|d         n|j
        d	k    r|j        |
|| j        vr |          V   ||          V  || j        v rt          pd          dz   '|| j        v r| j        |         }|Ct          t                     s	|| j        v rZ|| j        v r0|dk     r*d         dk    sJ d d         t          |          z   t                    t          |          z   |dk     r1dz  dk    r|z  t                    t          |          z   dz  dk    r|z  t                    t          |          z   #|| j        v r|| j        |         \  }} |t          |          |z             V  nGt          t                     s	|| j        v rt|| j        v r=|dk     r7d         dk    sJ  |d d         t          |          z   |z             V  n |t                    t          |          z   |z             V  n|dk     rXdz  dk    r! |t          |z             |z             V  n |t                    t          |          z   |z             V  nWdz  dk    r! |t          |z             |z             V  n- |t                    t          |          z   |z             V  d || j        v rz| j        |         }|t          t                     r!t                    t          |          z   dz  dk    r|z  t                    t          |          z   ,|| j        v r| j        |         \  }} |t          |          |z             V  ft          t                     r/ |t                    t          |          z   |z             V  dz  dk    r" |t          |z             |z             V   |t                    t          |          z   |z             V  || j        v r| j        |         }| t          t                     sdk    r; |          }|||z  nd }||j
        d	k    r	|j        e |          V  |vdz  dz  }dz  }|||z  z   || j        v r| j        |         \  }} |t          |          |z             V  nt          t                     ro |          }|||z  nd }|.|j
        d	k    r# |t          |j                  |z             V  n] |          V   |t          |          |z             V  n2dz  dz  }dz  }|||z  z    |t                    |z             V  d || j        v r8 |          V  || j        v s|r| j        |          ||          V  || j        v r-| j        |          |          V   ||          V  || j        v r| j        |         }t          |t.                    rG||v r' |t                    ||         z             V  d}O |          V   ||          V  k |t                    |z             V   ||          V  || j        v r|| j        vr |s |          V   ||          V  |dk    r'|| j        vr |          V   ||          V  |dk    s|dk    r}|| j        v s	|| j        v rM|dk    rdnd}| j                            |d          }t          pd          t          |          |z  z   d}d |          V   ||          V  |dk    r!|| j        v s|rt          pd          d
z   t7          d|           t7          d|            |          V  d S d S )NFr0   c                 D    	 t          |           S # t          $ r Y d S w xY wN)r   
ValueErrorrk   s    r   to_fractionz:EnglishNumberNormalizer.process_words.<locals>.to_fraction   s5    {{"   tts    
resultc                 <    t          |           } | z   } d d | S r   )str)r   prefixr2   s    r   outputz5EnglishNumberNormalizer.process_words.<locals>.output   s,    [[F!&EFMr   r   r7   z^\d+(\.\d+)?$r
   . 0
   r\   r]   Tr   r   r   r5   r   zUnexpected token: )r   r   intlenr	   rematchr   
isinstancerD   denominator	numeratorr   r   r   r   r   r   r   r   r   r   r   dictr   getr   r   )r   r   skipr   r   prevcurrentnextnext_is_numeric
has_prefixcurrent_without_prefixfr   suffixr   
multiplierpbeforeresidualrepeatsr   r2   s                       @@r   process_wordsz%EnglishNumberNormalizer.process_words   s      $+/	3 	 	 	 		5c? 	 	 	 	 	 	 	 u::??F#+TFUNdV,CQ#G#G C	A C	AD'4 "$.S28<Ld3S3SO t}4J4>%KWQRR[[G"x(*@AA {AK 677}}}$!%-- ,%..2E2E , #E

S\\ 9 $fUmm+++'1=v=A%%KEE2EE
**$ &--'''fWoo%%%%DJ&&EKR((3.DI%%y)= EEs++ 7tty/@/@	))dRii$RyC//// %crc
SYY 6 #E

SYY 6BYYrzQ #E

SYY 6s{a'' #E

SYY 6D...#1':f= &TV!3444444s++ Ftty/@/@ty((TBYY$RyC////$fU3B3Z#d))%;f%DEEEEEE$fSZZ#d))%;f%DEEEEEEBYYrzQ$fS%6%6%?@@@@@@$fSZZ#d))%;f%DEEEEEEs{a''$fS%6%6%?@@@@@@$fSZZ#d))%;f%DEEEEEDI%%y)= EEs++ 7JJT2EEs{a'' #E

SYY 6D...#1':f= &TV!3444444s++ F &Uc$ii!7&!@AAAAAAs{a''$fS%6%6%?@@@@@@$fSZZ#d))%;f%DEEEEEED,,,!-g6
=&EEs++ ;uzz#E**A*+-JTA}!);); !$fUmm+++ *"d]T1F$t|H"X
%::EED555%)%>w%G"
F= &Z6!9::::::s++ 6#E**A*+-JTA}!););$fS%5%5%>??????$fUmm+++$fS__v%=>>>>>>"d]T1F$t|H"X
%::E &Uf!455555D444$ &--'''4:%%%!5g>FF &//))))D444$!5g>F &--'''' &//))))DN**$!^G4F!&$// :6>>"(&UfTl)B"C"CCCC#'DD"(&--///"(&//1111$fSZZ&%8999999 &//))))DM))tz))/)($fUmm+++ &//))))%%4#333 ,"(&--///$fWoo---((Gx,?,?ty((DDJ,>,>'.(':':!!#y}}T155 #EKR 0 03t99w3F F# ,"(&--///$fWoo----''t},,, #EKR 0 03 6 %%C'%C%CDDD !!?g!?!?@@@&-- r   r0   c                    g }t          j        d|          }t          |          D ]\  }}t          |                                          dk    r+|t          |          dz
  k    r|                    |           W|                    |           |                    d          d         }|| j        v s	|| j        v r|                    d           |                    d           d		                    |          }t          j
        d
d|          }t          j
        dd|          }t          j
        dd|          }|S )Nz\band\s+a\s+half\br   r
   r5   )maxsplitr   z
point fivez
and a half z([a-z])([0-9])z\1 \2z([0-9])([a-z])z([0-9])\s+(st|nd|rd|th|s)\b\1\2)r   splitr   r   stripappendrsplitr   r   joinsub)r   r0   resultssegmentsr   segment	last_words          r   
preprocessz"EnglishNumberNormalizer.preprocess  sA   81155#H-- 	1 	1JAw7==??##q((CMMA%%%w''''w'''#NNAN66r:	--d>N1N1NNN<0000NN<0000HHW F$h22F$h22 F17A>>r   c                     dt           fd}dt           fd}t          j        d||          }t          j        d||          }t          j        dd|          }|S )Nmc                     	 |                      d          }|                      d          }t          |                      d                    }| | d|dS # t          $ r
 | j        cY S w xY w)Nr
   r5   r7   r   02d)groupr   r   string)r   currencyintegerr}   s       r   combine_centsz:EnglishNumberNormalizer.postprocess.<locals>.combine_cents  s|     771::''!**AGGAJJ"9G99e9999      x s   AA A,+A,c                 |    	 dt          |                     d                     S # t          $ r
 | j        cY S w xY w)Nru   r
   )r   r   r   r   )r   s    r   extract_centsz:EnglishNumberNormalizer.postprocess.<locals>.extract_cents  sK     -C

OO---      x s   $' ;;u,   ([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\bu   [€£$]0.([0-9]{1,2})\bz	\b1(s?)\bzone\1)r   r   r   )r   r0   r   r   s       r   postprocessz#EnglishNumberNormalizer.postprocess  s}    	 U 	  	  	  	 	 U 	  	  	  	  FBMSTUUF.qAA F<1--r   c                     |                      |          }d                    d |                     |                                          D                       }|                     |          }|S )Nr   c              3      K   | ]}||V  	d S r   r   )r   words     r   	<genexpr>z3EnglishNumberNormalizer.__call__.<locals>.<genexpr>  s'      XXdtGWTGWGWGWGWXXr   )r   r   r   r   r   r   r0   s     r   __call__z EnglishNumberNormalizer.__call__  sa    OOAHHXXd&8&8&C&CXXXXXQr   )__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   __classcell__)r   s   @r   r   r      s         L- L- L- L- L-\] 49 ] # ]  ]  ]  ] ~C    :S    2#        r   r   c                   $    e Zd ZdZd ZdefdZdS )EnglishSpellingNormalizerz~
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    c                     t           j                            t           j                            t                    d          }t          j        t          |                    | _        d S )Nzenglish.json)	ospathr   dirname__file__jsonloadopenr   )r   mapping_paths     r   r   z"EnglishSpellingNormalizer.__init__  sB    w||BGOOH$=$=~NNyl!3!344r   r0   c                 j     d                      fd|                                D                       S )Nr   c              3   N   K   | ]}j                             ||          V   d S r   )r   r   )r   r   r   s     r   r   z5EnglishSpellingNormalizer.__call__.<locals>.<genexpr>  s5      KK((t44KKKKKKr   )r   r   r   s   ` r   r   z"EnglishSpellingNormalizer.__call__  s2    xxKKKKKKKKKKr   N)r   r   r   r   r   r   r   r   r   r   r   r     sO         5 5 5L# L L L L L Lr   r   c                        e Zd Zd ZdefdZdS )EnglishTextNormalizerc                 b   d| _         i dddddddd	d
ddddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdLdQdRdS| _        t                      | _        t	                      | _        d S )TNz\b(hmm|mm|mhm|mmm|uh|um)\bz	\bwon't\bzwill notz	\bcan't\bzcan notz	\blet's\bzlet usz	\bain't\baintz	\by'all\bzyou allz	\bwanna\bzwant toz	\bgotta\bzgot toz	\bgonna\bzgoing toz\bi'ma\bzi am going toz\bimma\bz
\bwoulda\bz
would havez
\bcoulda\bz
could havez\bshoulda\bzshould havez	\bma'am\bmadamz\bmr\bzmister z\bmrs\bzmissus z\bst\bzsaint z\bdr\bzdoctor z\bprof\bz
professor z\bcapt\bzcaptain z\bgov\bz	governor z\bald\bz	alderman z\bgen\bzgeneral z\bsen\bzsenator z\brep\bzrepresentative z\bpres\bz
president z\brev\bz	reverend z\bhon\bz
honorable z\basst\bz
assistant z	\bassoc\bz
associate z\blt\bzlieutenant z\bcol\bzcolonel z\bjr\bzjunior z\bsr\bzsenior zesquire z	 had beenz	 has beenz	 had gonez	 has gonez	 had donez has gotz notz arez isz wouldz willz havez am)z\besq\bz	'd been\bz	's been\bz	'd gone\bz	's gone\bz	'd done\bz's got\bzn't\bz're\bz's\bz'd\bz'll\bz't\bz've\bz'm\b)ignore_patterns	replacersr   standardize_numbersr   standardize_spellings)r   s    r   r   zEnglishTextNormalizer.__init__  s#   <6
*6
 )6
 (	6

 &6
 )6
 )6
 (6
 *6
 6
 6
 <6
 <6
 M6
 '6
" y#6
$ 	%6
& x'6
 6
( y)6
* +6
, -6
. /6
0 16
2 
36
4 
56
6 )76
8 96
: ;6
< =6
> ?6
@ ,A6
B }C6
D 
E6
F yG6
H yI6
 6
J #%%%%%#k6
 6
 6
n $;#<#< %>%@%@"""r   r0   c                    |                                 }t          j        dd|          }t          j        dd|          }t          j        | j        d|          }t          j        dd|          }| j                                        D ]\  }}t          j        |||          }t          j        dd|          }t          j        dd	|          }t          |d
          }|                     |          }|                     |          }t          j        dd	|          }t          j        dd|          }t          j        dd|          }|S )Nz[<\[][^>\]]*[>\]]r   z\(([^)]+?)\)z\s+''z	(\d),(\d)r   z\.([^0-9]|$)z \1u
   .%$¢€£)keepu   [.$¢€£]([^0-9])z	([^0-9])%z\1 z\s+r   )	lowerr   r   r  r  r   r   r	  r
  )r   r0   patternreplacements       r   r   zEnglishTextNormalizer.__call__  s5   GGIIF'Q//F?B**F4'Q//F7C##$(N$8$8$:$: 	0 	0 G[wQ//AAF<!,,F?FA..)!,???$$Q''&&q)) F)6155F<++F63""r   N)r   r   r   r   r   r   r   r   r   r  r    sB        :A :A :Ax#      r   r  )r   r   r   	fractionsr   typingr   r   r   r   r   more_itertoolsr	   basicr   r   r   r  r   r   r   <module>r     s    				 				       9 9 9 9 9 9 9 9 9 9 9 9 9 9 # # # # # # 0 0 0 0 0 0s s s s s s s slL L L L L L L LU U U U U U U U U Ur   