
    bil&                    `   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZ d	d
lmZmZmZmZ d	dlmZ  ej        e          Zg dZeg dz   Z ddZ!de"de#fdZ$ddee#         dz  fdZ% G d d          Z& G d de&          Z'de#de"fdZ( G d d          Z) G d de)          Z* G d d e)          Z+ G d! d"e)          Z, G d# d$e)          Z- G d% d&e)          Z. G d' d(e)          Z/ G d) d*e)          Z0 G d+ d,e)          Z1 G d- d.e)          Z2 G d/ d0e)          Z3 G d1 d2e)          Z4 G d3 d4e)          Z5 G d5 d6e5          Z6 G d7 d8e5          Z7 G d9 d:e5          Z8 G d; d<e5          Z9 G d= d>e5          Z: G d? d@e5          Z; G dA dBe5          Z< G dC dDe5          Z= G dE dFe5          Z> G dG dHe5          Z? G dI dJe5          Z@ G dK dLe5          ZA G dM dNe5          ZB G dO dPe5          ZC G dQ dRe5          ZD G dS dTe5          ZE G dU dVe)          ZF G dW dXe5          ZG G dY dZe)          ZH G d[ d\e)          ZI G d] d^e)          ZJ G d_ d`e5          ZK G da dbe5          ZL G dc dde5          ZM G de dfe)          ZN G dg dhe5          ZO G di dje5          ZP G dk dle5          ZQdm ZR G dn do          ZS G dp dq          ZTi dre6dse2dte7due*dveGdweJdxe8dyeHdze/d{e*d|e4d}e9d~e*de*de*de*de*i de6de,de/de0de*de*de2de>de2de2de*deNde:de;de-de*de2i de<de.deCde1de@deAde2de3de=de*deDdeEdeFde>de?de+deKeMeMeLeMdZUddefdZVdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)
Collection)	lru_cache)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece)tqdm   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR)ar_ARcs_CZde_DEen_XXes_XXet_EEfi_FIfr_XXgu_INhi_INit_ITja_XXkk_KZko_KRlt_LTlv_LVmy_MMne_NPnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CN)af_ZAaz_AZbn_INfa_IRhe_ILhr_HRid_IDka_GEkm_KHmk_MKml_INmn_MNmr_INpl_PLps_AFpt_XXsv_SEsw_KEta_INte_INth_THtl_XXuk_UAur_PKxh_ZAgl_ESsl_SI c                    t                      rddlm} |S t                      rGdd l}t          j        |j        j                  t          j        d          k     rddl	m} nddl	m
} |S t          t          j        |                     )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecerM   r   google.protobufr   parseprotobuf__version__transformers.utilsrN   ImportErrorr   format)error_messagerM   googles      ^/root/projects/butler/venv/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufrZ   _   s    !## '999999&& 	G=455g8N8NNNBBBBBBBaaaaaa&&/6}EEFFF    add_prefix_spacereturnc                 :    | rd}t          |dd          sd}nd}|S )NalwayslegacyTfirstnever)getattr)r\   original_tokenizerprepend_schemes      rY   _get_prepend_schemerf   p   s5     !!)8T:: 	%$N r[   skip_tokensc                     |t          |          nt                      }|d u}|rt          |          n }g }|                                D ]\  }}||v r
g }t          dt	          |                    D ]?}|d |         ||d          }
}	|	|v s|
|v r|	 v r|
 v r|                    |	|
|f           @t          | fd          }|                    |           t          |d |          }d |D             }|S )Nr   c                 <    | d                  | d                  fS Nr   r    )xvocabs    rY   <lambda>z!generate_merges.<locals>.<lambda>   s    U1Q4[%!+,F r[   keyc                 d    | d         t          | d                   t          | d                   fS )N   r   r   )lenvals    rY   rn   z!generate_merges.<locals>.<lambda>   s%    SVSQ[[#c!f++,N r[   rp   reversec                 .    g | ]}|d          |d         fS r   r   rk   ).0ru   s     rY   
<listcomp>z#generate_merges.<locals>.<listcomp>   s%    1113s1vs1v111r[   )setdictitemsrangers   appendsortedextend)rm   vocab_scoresrg   rw   mergesmergepiece_scorelocalindexpiece_lpiece_rs   `          rY   generate_mergesr   z   sX   &1&=#k"""355K$&G)0;4%%%eLF*0022  {K1c%jj)) 	> 	>E$VeV}eEFFmWG+%%K)?)?%Gu$4$4gw<===u"F"F"F"FGGGeF N NX_```F11&111FMr[   c                   V    e Zd ZdZdefdZdeeeef         e	e         f         fdZ
dS )SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 6   t          | d           t          | d           t                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _        d S )NrO   rR   rb)r   rZ   
ModelProtoopenParseFromStringreadproto)selfr   	model_pb2mfs        rY   __init__zSentencePieceExtractor.__init__   s    $000$
+++ $%%	  ""% 	(!affhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


s   (BBBr]   c                    | j         j        j         |!ddlm}m} | j         j        j        dk    r|n|}d | j         j        D             }|j        dk    r| j         j        j        |d<   ||d<   n4dd	l	m
} d
 t          |          D             } ||          }||d<   ||d<   d t          | j         j                  D             }d t          |d           D             |d<   |S )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        Nr   )r   r   r   c                 *    g | ]}|j         |j        fS rk   piecescorerz   r   s     rY   r{   z2SentencePieceExtractor.extract.<locals>.<listcomp>   s!    KKK%+u{+KKKr[   r   unk_idrm   )r   c                      i | ]\  }\  }}||S rk   rk   rz   iwordr   s       rY   
<dictcomp>z2SentencePieceExtractor.extract.<locals>.<dictcomp>   s#    FFF!1MT5T1FFFr[   r   c                 L    g | ]!\  }}|j         d v ||j        |j         dk    f"S )      r   )typer   )rz   idps      rY   r{   z2SentencePieceExtractor.extract.<locals>.<listcomp>   s:    uuu52qdedjntdtdtR!&A+6dtdtdtr[   c                 :    g | ]\  }}}t          |d |          S F
normalizedspecialr   rz   r   tokenr   s       rY   r{   z2SentencePieceExtractor.extract.<locals>.<listcomp>   s=     /
 /
 /
"E7 u@@@/
 /
 /
r[   c                     | d         S Nr   rk   rl   s    rY   rn   z0SentencePieceExtractor.extract.<locals>.<lambda>   s
    QqT r[   ro   additional_special_tokens)r   trainer_specr   tokenizers.modelsr   r   
model_typepieces__name__tokenization_utils_baser   	enumerater   )	r   r   kwargsr   r   rm   r   r   spm_added_tokenss	            rY   extractzSentencePieceExtractor.extract   s=   
 	
&&66666666$(J$;$F!$K$KQTJKK9JKKK%''#z6=F8#F7OO@@@@@@FFYu5E5EFFFE$_U++F#F7O%F8 vuIdjN_D`D`uuu/
 /
&,-=>>&R&R&R/
 /
 /
*+ r[   N)r   
__module____qualname____doc__strr   tupler}   intlistr   rk   r[   rY   r   r      sj         
c 
 
 
 
 uT#s(^T%[5P/Q            r[   r   c                   H    e Zd Zddeeeef         ee         f         fdZdS )GemmaSentencePieceExtractorNr]   c                     | j         fdt                                                    D             }d|vr|                    d          |d<   t	          ||          }||fS )r   c                 <    i | ]}                     |          |S rk   )id_to_piece)rz   r   sps     rY   r   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>   s'    TTT%&&TTTr[   	<0x09>)r   r   GetPieceSizegetr   )r   r   rm   r   r   s       @rY   r   z#GemmaSentencePieceExtractor.extract   sr    
 WTTTT5ARAR;S;STTT u))H--E$K 55f}r[   N)	r   r   r   r   r}   r   r   r   r   rk   r[   rY   r   r      sJ         E$sCx.$u+2M,N      r[   r   r   c                 v    t          |           dk     p&| d         dk    p| d                                          S )Nrr   ,)rs   isdigit)r   s    rY   check_number_commar      s8    u::>HU2Y#-HU2Y5F5F5H5H1HHr[   c                        e Zd Zd ZdefdZdS )	Converterc                     || _         d S r   )rd   )r   rd   s     rY   r   zConverter.__init__   s    "4r[   r]   c                     t                      r   )NotImplementedErrorr   s    rY   	convertedzConverter.converted   s    !###r[   N)r   r   r   r   r   r   rk   r[   rY   r   r      s>        5 5 5$9 $ $ $ $ $ $r[   r   c                       e Zd ZdefdZdS )BertConverterr]   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixrd   rm   r   r   r   r   hasattrr   tokenize_chinese_charsr   do_lower_caser
   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr	   decoder
r   rm   	tokenizerr   r   r   clssepr  r  s
             rY   r   zBertConverter.converted   y   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r[   Nr   r   r   r   r   rk   r[   rY   r   r      /        #9 # # # # # #r[   r   c                       e Zd ZdefdZdS )SplinterConverterr]   c           
         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }t	          | j         j                  }d}	| j         j        }
| j         j        }| j         j        }| j                             d          }| j         j        dk    r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3          j        | d| d|||
f||f||f|	|fg          |_        t9          j        d          |_        |S )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )rd   rm   r   r   r   r   r   r   r   r   r   r
   r   r   r   r   r  r  r  question_tokenr  r  question_token_idconvert_tokens_to_idspadding_sider   r  r  r	   r  )r   rm   r
  r   r   r   r  r  questiondotr  r  r  dot_token_idr   s                  rY   r   zSplinterConverter.converted  s   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344t.=>>.;.; 3E.DDSII"/7::HH8HHcHHCHHHHHDDHH3HHHH3HHHHHD#-#@**3***l#l#,-l#		$
 	$
 	$
	  %.d;;;	r[   Nr  rk   r[   rY   r  r  
  s/        .9 . . . . . .r[   r  c                       e Zd ZdefdZdS )FunnelConverterr]   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r	  s
             rY   r   zFunnelConverter.converted=  r  r[   Nr  rk   r[   rY   r  r  <  r  r[   r  c                       e Zd ZdefdZdS )MPNetConverterr]   c                    | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	| d
||f||	fg          |_        t1          j        d          |_        |S )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r	  s
             rY   r   zMPNetConverter.convertedd  s   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***======c===l#l#$
 $
 $
	  %.d;;;	r[   Nr  rk   r[   rY   r!  r!  c  r  r[   r!  c                       e Zd ZdefdZdS )OpenAIGPTConverterr]   c           
         | j         j        }t          | j         j                                                  }| j         j        }t          t          ||d t          |          dd                    }|	                    t          |                    #|
                    t          |          g           t          j        d          |_        t          j                    |_        t#          j        d          |_        |S )N</w>F)rm   r   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)rd   encoderr   	bpe_rankskeysr   r   r   r   token_to_idadd_special_tokensr
   r   r   r   r   r  r	   
BPEDecoderr  r   rm   r   r   r
  s        rY   r   zOpenAIGPTConverter.converted  s    '/d-7<<>>??+5	i..#)  	
 	
	   Y00<((#i..)9:::*9DIII	"0"A"C"C	$/v>>>	r[   Nr  rk   r[   rY   r$  r$    s/        9      r[   r$  c                   \    e Zd Zddeeef         dz  deeeef                  dz  defdZ	dS )GPT2ConverterNrm   r   r]   c           
         |s| j         j        }|st          | j         j                  }t	          t          ||d ddd                    }t          | j         dd          }t          j        |          |_	        t          j                    |_        t          | j         dd          r>| j         j        }| j         j        }t          j        | d| d||fg	          |_        nt          j        d
          |_        |S )NrK   Frm   r   r'  continuing_subword_prefixr(  r)  r\   r\   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)rd   r,  r   r-  r   r   rc   r   	ByteLevelr  r	   r  	bos_tokenbos_token_idr   r  r  )r   rm   r   r
  r\   bosr>  s          rY   r   zGPT2Converter.converted  s/    	4+3E 	=$1;<<F*,#%  	
 	
	 #4#:<NPUVV"0":L\"]"]"]	$.00	4*OUCC 	P)3C2?L'1'D))),' ( ( (I$$ (2';'O'O'OI$r[   NN
r   r   r   r}   r   r   r   r   r   r   rk   r[   rY   r4  r4    sf        " "tCH~4 "T%PSUXPX/EZ]aEa "mv " " " " " "r[   r4  c                       e Zd ZdefdZdS )HerbertConverterr]   c           	      .   d}d}| j         j        }t          | j         j                                                  }||d         d         v r
|dd          }t          t          ||d | j         j        |                    }t          j	        dd          |_
        t          j                    |_        t          j        |          |_        t#          j        | j         j        | j         j        f| j         j        | j         j        f	          |_        |S )
Nz	#version:r&  r   r   )r'  r   r(  F)r   r   r*  )r  r  )rd   r,  r   r-  r.  r   r   r   r
   r   r   r   r   r  r	   r1  r  r   BertProcessingr  r  r  r  r  )r   tokenizer_info_strtoken_suffixrm   r   r
  s         rY   r   zHerbertConverter.converted  s   ('/d-7<<>>??1--ABBZF1;#/  
 
	  +9EY^___	"0"A"C"C	$/|DDD	#-#<(2D4K4XY(2D4K4XY$
 $
 $
	 
 r[   Nr  rk   r[   rY   rC  rC    /        9      r[   rC  c                   \    e Zd Zddeeef         dz  deeeef                  dz  defdZ	dS )Qwen2ConverterNrm   r   r]   c                 "   |s| j         j        }|s+t          | j         j                                                  }t          t          ||d d dddd                    }t          j                    |_	        t          j        t          j        t          d          dd          t          j        t          | j         dd          d          g          |_        t#          j                    |_        t'          j        d	          |_        |S )
NrK   F)rm   r   r'  r   r7  r(  r)  byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr\   r\   	use_regexr:  )rd   r,  r   r-  r.  r   r   r
   NFCr   r   SequenceSplitr   r<  rc   r  r	   r  r   r  )r   rm   r   r
  s       rY   r   zQwen2Converter.converted  s/    	4+3E 	D$1;@@BBCCF*,#%#	 	 	
 
	  +00	"0"9$ N  (    (%,T-DFXZ_%`%`#  #
 #
	  %.00	#-#7U#K#K#K	 r[   r@  rA  rk   r[   rY   rJ  rJ    sf        ( (tCH~4 (T%PSUXPX/EZ]aEa (mv ( ( ( ( ( (r[   rJ  c                       e Zd ZdefdZdS )RobertaConverterr]   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        |j        |j        f|j        |j        f|j	        d          |_        |S )NrK   Fr6  r8  Tr  r  r\   r;  )rd   r,  r   r-  r.  r   r   r   r<  r\   r  r	   r  r   RobertaProcessingr  r  r  r  r  r   otrm   r   r
  s        rY   r   zRobertaConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#?r/r/0	$
 $
 $
	  r[   Nr  rk   r[   rY   rW  rW    /        9      r[   rW  c                       e Zd ZdefdZdS )RoFormerConverterr]   c           	         ddl m} | j        j        }t	          t          |t          | j        j                                      }d}d}t          | j        d          r"| j        j	        j
        }| j        j	        j        }t          j        dd||          |_        t          j                             ||                    |_        t          | j        j                  }t          | j        j                  }| j        j        }| j        j        }	t/          j        | d| d	| d| d
| d||f||	fg          |_        t5          j        d          |_        |S )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsra  rd   rm   r   r   r   r   r   r   r   r   r
   r   r   r   PreTokenizercustomr  r  r  r  r  r   r  r  r	   r  )
r   ra  rm   r
  r   r   r  r  r  r  s
             rY   r   zRoFormerConverter.converted4  s   IIIIII'-iT=T=^9_9_```aa	4*,=>> 	R 3CQM 3CQM*9!&'#	 
  
  
	 #1"="D"DEVEVW\E]E]"^"^	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r[   Nr  rk   r[   rY   r_  r_  3  r  r[   r_  c                       e Zd ZdefdZdS )DebertaConverterr]   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        ddd| j                             d          fd| j                             d          fg	          |_        |S )
NrK   Fr6  r8  [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )rd   r,  r   r-  r.  r   r   r   r<  r\   r  r	   r  r   r  r  r  r[  s        rY   r   zDebertaConverter.converted[  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#@)4$1GGPPQ$1GGPPQ$
 $
 $
	  r[   Nr  rk   r[   rY   rf  rf  Z  rH  r[   rf  c                   x     e Zd ZdZeZi Zedd            Z fdZ	d Z
d Zd Zd Zd	 Zd
 Zd ZdefdZ xZS )SpmConverterFNc                     |||d<   |S )z
        Hook used when converting directly from a SentencePiece model without a slow tokenizer instance.
        By default, return kwargs unchanged.
        Nrm   rk   )r  rm   r   s      rY   convert_from_spmzSpmConverter.convert_from_spm~  s     #F7Or[   c                    t          | d            t                      j        |  t                      }|                                }t          | j        j        d          5 }|                    |	                                           d d d            n# 1 swxY w Y   || _
        | j
        j        j        r| j        st          j        d           d S d S d S )NrR   r   a  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr   rZ   r   r   rd   
vocab_filer   r   r   r   rL  handle_byte_fallbackwarningswarn)r   argsr   r   r   	__class__s        rY   r   zSpmConverter.__init__  s"   $
+++$ $%%	  ""$)4d;; 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(
:"0 	9R 	Me    	 	 	 	s   $(BBBc                 $    d |j         D             S )Nc                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z&SpmConverter.vocab.<locals>.<listcomp>  s!    EEEuek*EEEr[   r   r   r   s     rY   rm   zSpmConverter.vocab  s    EEEEEEr[   c                     |j         j        S r   )r   r   r{  s     rY   r   zSpmConverter.unk_id  s    !((r[   c           
          |j         j        }                     |          }|dk    r8t          t	          |                     |           j                            }n|dk    r                      j        j	                  
                    |          \  }}d t          |          D             }t          t          |||j         j        d j        d                     }nt          d           fdt          |j                  D             }|                    d	 t#          |d
           D                        |S )Nr   r   rL  rr   c                      i | ]\  }\  }}||S rk   rk   r   s       rY   r   z*SpmConverter.tokenizer.<locals>.<dictcomp>  s#    QQQ%5QuqQQQr[   Tr   r)  rL  r'  z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S r   r   r   r   rz   r   r   r   s      rY   r{   z*SpmConverter.tokenizer.<locals>.<listcomp>  R     
 
 
Av !&A+GD4G)GHr[   c                 :    g | ]\  }}}t          |d |          S r   r   r   s       rY   r{   z*SpmConverter.tokenizer.<locals>.<listcomp>  =       &Bw 5UGDDD  r[   c                     | d         S r   rk   r   s    rY   rn   z(SpmConverter.tokenizer.<locals>.<lambda>      QRSTQU r[   ro   )r   r   rm   r   r   r   rs  SpmExtractorrd   rr  r   r   r   	unk_piece	Exceptionr   
add_tokensr   )	r   r   r   r   r
  _r   	bpe_vocabr   s	   `        rY   r
  zSpmConverter.tokenizer  s   '2
zz%((??! ;;u--"&";   II 1__))$*A*LMMUUVbccIAvQQ<9P9PQQQI!#0:!"&";   	 	II o  
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 r[   c                 
   |j         j        }t          j        dd          t          j        t          d          d          g}|st          j        |          S t          j        t          j        |          g|z             S )NFT)leftr   {2,}   ▁)normalizer_specprecompiled_charsmapr
   StripReplacer   rT  Precompiledr   r   r  _normalizerss       rY   r   zSpmConverter.normalizer  s    $4I5555g66
 $ 	h'555')@AU)V)V(WZf(fgggr[   c                 X    t          || j                  }t          j        ||          S Nreplacementre   )rf   rd   r   	Metaspacer   r  r\   re   s       rY   r  zSpmConverter.pre_tokenizer  s,    ,-=t?VWW'KP^____r[   c                     d S r   rk   r   s    rY   r  zSpmConverter.post_processor  s    tr[   c                 X    t          || j                  }t          j        ||          S r  )rf   rd   r	   r  r  s       rY   r  zSpmConverter.decoder  s+    ,-=t?VWW!k.YYYYr[   r]   c                 x   |                      | j                  }|                     | j                  }|||_        d}d}t          | j        d          r| j        j        }|                     ||          }|||_        |                     ||          |_        |                                 }|r||_        |S )Nr  Tr\   )	r
  r   r   r   rd   r\   r  r  r  )r   r
  r   r  r\   r  r  s          rY   r   zSpmConverter.converted  s    NN4:..	 __TZ00
!#-I 4*,>?? 	H#6G**;8HII$&3I# LL6FGG	,,.. 	6'5I$r[   r   )r   r   r   rs  r   r  r   classmethodro  r   rm   r   r
  r   r  r  r  r   r   __classcell__)rw  s   @rY   rm  rm  y  s         )LN   [    *F F F) ) )0 0 0d	h 	h 	h` ` `  Z Z Z9        r[   rm  c                        e Zd Zd Zd Zd ZdS )AlbertConverterc                 $    d |j         D             S )Nc                 t    g | ]5}t          |j                  r|j        |j        fn|j        |j        d z
  f6S d   r   r   r   r   s     rY   r{   z)AlbertConverter.vocab.<locals>.<listcomp>  X     
 
 
 +=U[*I*IoU[%+&&PUP[]b]hkn]nOo
 
 
r[   rz  r{  s     rY   rm   zAlbertConverter.vocab  %    
 

 
 
 	
r[   c                 f   t          j        dd          t          j        dd          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j                               |j	        j
        }|r'|                    t          j        |                     |                    t          j        t          d          d                     t          j        |          S Nz``"z''r  r  r
   r  rd   keep_accentsr   NFKDStripAccentsr   	Lowercaser  r  r  r   rT  r   r   list_normalizersr  s       rY   r   zAlbertConverter.normalizer     c**c**
 &3 	@##K$4$6$6777##K$<$>$>???"0 	=##K$9$;$;<<<$4I 	S##K$;<P$Q$QRRR 3E'NNC H HIII#$4555r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S Nrh  ri  rj  rk  r   r   r  rd   r  r   s    rY   r  zAlbertConverter.post_processor  Y    ,)4$1GGPPQ$1GGPPQ
 
 
 	
r[   Nr   r   r   rm   r   r  rk   r[   rY   r  r    A        
 
 
6 6 6&
 
 
 
 
r[   r  c                       e Zd Zd Zd ZdS )BarthezConverterc                 
    d}|S Nr   rk   r   r   r   s      rY   r   zBarthezConverter.unk_id*      r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s    rY   r  zBarthezConverter.post_processor.  Y    , +/EEeLLM0FFvNNO
 
 
 	
r[   N)r   r   r   r   r  rk   r[   rY   r  r  )  s2          
 
 
 
 
r[   r  c                   8    e Zd Zd Zd Zd Zedd            ZdS )CamembertConverterc                 R    g d}|d |j         dd          D             z  }|dgz  }|S )N)z
<s>NOTUSED        <pad>r  z</s>NOTUSEDr  <unk>r  )<unk>NOTUSEDic                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z,CamembertConverter.vocab.<locals>.<listcomp>C  !    KKK5;,KKKr[   r   <mask>r  rz  r   r   rm   s      rY   rm   zCamembertConverter.vocab:  sK    
 
 
 	KK%,qrr:JKKKK/""r[   c                     dS r  rk   r{  s     rY   r   zCamembertConverter.unk_idG  s    qr[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    rY   r  z!CamembertConverter.post_processorK  r  r[   Nc                 ~   t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }d|dfd	|dfd
g}|*|                    t          |          dd                     |                    |df           ||d<   |S )N	pad_tokenr  r   r  
mask_tokenr  r  r  r  )r        Yr   rm   r   r   r   r   r   )r  rm   r   r  r   r  
vocab_lists          rY   ro  z#CamembertConverter.convert_from_spmU  s    

;8899	

;8899	L(;;<<
   $

 d5kk!""o...:s+,,,$wr[   r   r   r   r   rm   r   r  r  ro  rk   r[   rY   r  r  9  sa            
 
 
    [  r[   r  c                        e Zd Zd Zd Zd ZdS )DebertaV2Converterc                    g }| j         j        r(|                    t          j        d                     t          || j                   }|                    t          j        ||                     t          j        |          S )NrM  )rO  r  )rd   split_by_punctr   r   Punctuationrf   r  rT  )r   r  r\   list_pretokenizersre   s        rY   r  z DebertaV2Converter.pre_tokenizerj  s    "1 	W%%n&@*&U&U&UVVV,-=t?VWW!!.":{cq"r"r"rsss&'9:::r[   c                    g }| j         j        r&|                    t          j                               |                    t          j                               |j        j        }|r'|                    t          j        |                     |                    t          j	        t          d          d                     t          j        |          S )Nr  r  )rd   r   r   r
   r  r  r  r  r  r  r   rT  r  s       rY   r   zDebertaV2Converter.normalizerr  s    "0 	=##K$9$;$;<<< 1 3 3444$4I 	S##K$;<P$Q$QRRR 3E'NNC H HIII#$4555r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    rY   r  z!DebertaV2Converter.post_processor  r  r[   N)r   r   r   r  r   r  rk   r[   rY   r  r  i  sA        ; ; ;6 6 6
 
 
 
 
r[   r  c                   8    e Zd Zd Zd Zd Zedd            ZdS )MBartConverterc                 `    g d}|d |j         dd          D             z  }|g dz  }|dgz  }|S )Nr  r  r  r  r  r  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z(MBartConverter.vocab.<locals>.<listcomp>  r  r[   r   )r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r   r  r    r  r!   r  r"   r  r#   r  r$   r  r%   r  r&   r  r'   r  r(   r  r)   r  r*   r  r+   r  r,   r  r-   r  r.   r  r/   r  r  rz  r  s      rY   rm   zMBartConverter.vocab  sf    
 
 
 	KK%,qrr:JKKKK 
 
 
 	
6 	/""r[   c                     dS r  rk   r{  s     rY   r   zMBartConverter.unk_id      qr[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz$A </s> en_XXz$A $B </s> en_XXr   r  r   r  r   s    rY   r  zMBartConverter.post_processor  Y    ,"#$1GGPPQ0FFvNNO
 
 
 	
r[   Nc                 X   t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    d	d
                    }|df|df|df|dfg}|*|                    t          |          dd                     |                    d t          D                        |                    |df           ||d<   |S )Nr=  r  r  r  	eos_tokenr  r   r  r  r  r  r   c              3      K   | ]}|d fV  	dS r  Nrk   rz   	lang_codes     rY   	<genexpr>z2MBartConverter.convert_from_spm.<locals>.<genexpr>  s'      LLy9c*LLLLLLr[   rm   )r   r   r   r   MBART_LANGUAGESr   	r  rm   r   r=  r  r  r   r  r  s	            rY   ro  zMBartConverter.convert_from_spm  s&   

;6677	

;8899	

;7788	

;8899	L(;;<<
 	

 d5kk!""o...LLOLLLLLL:s+,,,$wr[   r   r  rk   r[   rY   r  r    sb        $ $ $L  
 
 
    [  r[   r  c                   8    e Zd Zd Zd Zd Zedd            ZdS )MBart50Converterc                 `    g d}|d |j         dd          D             z  }|g dz  }|dgz  }|S )Nr  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z*MBart50Converter.vocab.<locals>.<listcomp>  r  r[   r   )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r	  r
  )r0   r  )r1   r  )r2   r  )r3   r  )r4   r  )r5   r  )r6   r  )r7   r  )r8   r  )r9   r  )r:   r  )r;   r  )r<   r  )r=   r  )r>   r  )r?   r  )r@   r  )rA   r  )rB   r  )rC   r  )rD   r  )rE   r  )rF   r  )rG   r  )rH   r  )rI   r  )rJ   r  r  rz  r  s      rY   rm   zMBart50Converter.vocab  sh    
 
 
 	KK%,qrr:JKKKK  R  R  R  	R/""r[   c                     dS r  rk   r{  s     rY   r   zMBart50Converter.unk_id  r  r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nzen_XX $A </s>zen_XX $A $B </s>r   r  r   r  r   s    rY   r  zMBart50Converter.post_processor  r  r[   Nc                 X   t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    d	d
                    }|df|df|df|dfg}|*|                    t          |          dd                     |                    d t          D                        |                    |df           ||d<   |S )Nr  r  r  r  r  r  r   r  r  r  r  r   c              3      K   | ]}|d fV  	dS r  rk   r  s     rY   r  z4MBart50Converter.convert_from_spm.<locals>.<genexpr>  s'      NNy9c*NNNNNNr[   rm   )r   r   r   r   MBART50_LANGUAGESr   )	r  rm   r   r  r  r  r   r  r  s	            rY   ro  z!MBart50Converter.convert_from_spm  s'   

;6677	

;8899	

;7788	

;8899	L(;;<<
 	

 d5kk!""o...NN<MNNNNNN:s+,,,$wr[   r   r  rk   r[   rY   r  r    sa        
 
 
  
 
 
    [  r[   r  c                   8    e Zd Zd Zd Zd Zedd            ZdS )NllbConverterc                 F    g d}|d |j         dd          D             z  }|S )Nr  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z'NllbConverter.vocab.<locals>.<listcomp>  r  r[   r   rz  r  s      rY   rm   zNllbConverter.vocab  >    
 
 
 	KK%,qrr:JKKKKr[   c                     dS r  rk   r{  s     rY   r   zNllbConverter.unk_id  r  r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s    rY   r  zNllbConverter.post_processor  sY    ,%&T4JJ:VVW0FFvNNO
 
 
 	
r[   Nc                    t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }|d	|d
|d|di}|Qt          |t                    r|                                nd |D             }|D ]}	|	|v rt          |          ||	<   ||d<   |S )Nr=  r  r  r  r  r  r   r  r   r   rr   r   c                     g | ]\  }}|S rk   rk   )rz   tokr  s      rY   r{   z2NllbConverter.convert_from_spm.<locals>.<listcomp>*  s    B[B[B[63PQ3B[B[B[r[   rm   )r   r   
isinstancer}   r.  rs   )
r  rm   r   r=  r  r  r   reordered_vocabtokensr   s
             rY   ro  zNllbConverter.convert_from_spm  s   

;6677	

;8899	

;7788	

;8899	 qqqq	
 %/t%<%<[UZZ\\\B[B[UZB[B[B[F > >O++),_)=)=&&)wr[   r   r  rk   r[   rY   r"  r"    sa            
 
 
    [  r[   r"  c                        e Zd Zd Zd Zd ZdS )SeamlessM4TConverterc                 F    g d}|d |j         dd          D             z  }|S )N)r  r  r  r  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>;  r  r[   r   rz  r  s      rY   rm   zSeamlessM4TConverter.vocab4  r%  r[   c                     | j         j        S r   )rd   unk_token_idr{  s     rY   r   zSeamlessM4TConverter.unk_id>  s    &33r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s    rY   r  z#SeamlessM4TConverter.post_processorA  sY    ,$%D3II)TTU0FFvNNO
 
 
 	
r[   Nr   r   r   rm   r   r  rk   r[   rY   r0  r0  3  sA          4 4 4
 
 
 
 
r[   r0  c                   8    e Zd Zd Zd Zd Zedd            ZdS )XLMRobertaConverterc                 R    g d}|d |j         dd          D             z  }|dgz  }|S )Nr  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z-XLMRobertaConverter.vocab.<locals>.<listcomp>T  r  r[   r   r  rz  r  s      rY   rm   zXLMRobertaConverter.vocabM  sK    
 
 
 	KK%,qrr:JKKKK/""r[   c                 
    d}|S r  rk   r  s      rY   r   zXLMRobertaConverter.unk_idX  r  r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    rY   r  z"XLMRobertaConverter.post_processor\  r  r[   Nc                    t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    dd                    }t          |                    d	d
                    }|df|df|df|dfg}|*|                    t          |          dd                     |                    |df           ||d<   |S )Nr=  r  r  r  r  r  r   r  r  r  r  r   rm   r  r  s	            rY   ro  z$XLMRobertaConverter.convert_from_spmf  s   

;6677	

;8899	

;7788	

;8899	L(;;<<
 	

 d5kk!""o...:s+,,,$wr[   r   r  rk   r[   rY   r9  r9  L  sa        	 	 	  
 
 
    [  r[   r9  c                        e Zd Zd Zd Zd ZdS )XLNetConverterc                 $    d |j         D             S )Nc                 t    g | ]5}t          |j                  r|j        |j        fn|j        |j        d z
  f6S r  r  r   s     rY   r{   z(XLNetConverter.vocab.<locals>.<listcomp>}  r  r[   rz  r{  s     rY   rm   zXLNetConverter.vocab|  r  r[   c                 f   t          j        dd          t          j        dd          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j                               |j	        j
        }|r'|                    t          j        |                     |                    t          j        t          d          d                     t          j        |          S r  r  r  s       rY   r   zXLNetConverter.normalizer  r  r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s    rY   r  zXLNetConverter.post_processor  r  r[   Nr  rk   r[   rY   r@  r@  {  r  r[   r@  c                       e Zd ZdS )ReformerConverterNr   r   r   rk   r[   rY   rF  rF            Dr[   rF  c                       e Zd Zd Zd ZdS )RemBertConverterc                 >   t          j        dd          t          j        dd          t          j        t          d          d          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j	                               |j
        j        }|r'|                    t          j        |                     t          j        |          S r  )r
   r  r   rd   r  r   r  r  r   r  r  r  r  rT  r  s       rY   r   zRemBertConverter.normalizer  s    c**c**g44

 &3 	@##K$4$6$6777##K$<$>$>???"0 	=##K$9$;$;<<<$4I 	S##K$;<P$Q$QRRR#$4555r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    rY   r  zRemBertConverter.post_processor  r  r[   N)r   r   r   r   r  rk   r[   rY   rJ  rJ    s2        6 6 6&
 
 
 
 
r[   rJ  c                       e Zd ZdS )BertGenerationConverterNrG  rk   r[   rY   rN  rN    rH  r[   rN  c                   &    e Zd Zd Zd Zd Zd ZdS )PegasusConverterc                 p   | j         j        df| j         j        dfg}| j         j        || j         j        dfgz  }| j         j        ,| j         j        | j         j        k     r|| j         j        dfgz  }|d t          d| j         j                  D             z  }|d |j        dd          D             z  }|S )Nr  c                     g | ]
}d | ddfS )z<unk_>r  rk   rz   r   s     rY   r{   z*PegasusConverter.vocab.<locals>.<listcomp>  s%    [[[Q<1<<<([[[r[   rr   c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z*PegasusConverter.vocab.<locals>.<listcomp>  r  r[   )	rd   r  r  mask_token_sentr  mask_token_idoffsetr   r   r  s      rY   rm   zPegasusConverter.vocab  s    $.4$.4

 "2>t.>DEEE #.:'58O8VVVt.93?@@E[[%4;R;Y2Z2Z[[[[KK%,qrr:JKKKKr[   c                 4    |j         j        | j        j        z   S r   )r   r   rd   rX  r{  s     rY   r   zPegasusConverter.unk_id  s    !(4+B+IIIr[   c                     t          || j                  }t          j        t          j                    t          j        ||          g          S r  )rf   rd   r   rT  WhitespaceSplitr  r  s       rY   r  zPegasusConverter.pre_tokenizer  sO    ,-=t?VWW&.00([Q_```
 
 	
r[   c                 p    | j         j        }|| j         j        fg}t          j        d|gdd|g|          S )N$A$Br   )rd   r  eos_token_idr   r  )r   eosr   s      rY   r  zPegasusConverter.post_processor  sI    %/$)67
 ,T3KtTSVFWhvwwwwr[   N)r   r   r   rm   r   r  r  rk   r[   rY   rP  rP    sX          &J J J
 
 
x x x x xr[   rP  c                   2    e Zd Zd Zd Zedd            ZdS )T5Converterc                     | j         j        }d |j        D             }|d t          |dz
  dd          D             z  }|S )Nc                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z%T5Converter.vocab.<locals>.<listcomp>  s!    FFF%+u{+FFFr[   c                     g | ]
}d | ddfS )
<extra_id_rS  r  rk   rT  s     rY   r{   z%T5Converter.vocab.<locals>.<listcomp>  s)    UUUq$$$$c*UUUr[   r   r   )rd   
_extra_idsr   r   )r   r   num_extra_idsrm   s       rY   rm   zT5Converter.vocab  sR    /:FFFFFUUE-!:KRQS4T4TUUUUr[   c                 n    t          j        ddgg dd| j                            d          fg          S Nr]  r  )r]  r  r^  r  r   r  r   s    rY   r  zT5Converter.post_processor  J    ,&>---0FFvNNO
 
 
 	
r[   Nc                    |                     dd          }d t          |dz
  dd          D             }|t          |          ng }|                    d |D                        |                    d|           ||d<   |S )	N	extra_idsr  c                     g | ]}d | d	S )rf  rS  rk   rT  s     rY   r{   z0T5Converter.convert_from_spm.<locals>.<listcomp>  s$    PPPa)Q)))PPPr[   r   r   c              3      K   | ]}|d fV  	dS r  rk   rz   r   s     rY   r  z/T5Converter.convert_from_spm.<locals>.<genexpr>  s&      AA55#,AAAAAAr[   r   rm   )r   r   r   r   
setdefault)r  rm   r   rm  extra_tokensr  s         rY   ro  zT5Converter.convert_from_spm   s    JJ{C00	PP5QB3O3OPPP$)$5T%[[[2
AALAAAAAA5|DDD$wr[   r   )r   r   r   rm   r  r  ro  rk   r[   rY   rb  rb    sR          
 
 
    [  r[   rb  c                       e Zd Zd ZdS )UdopConverterc                 n    t          j        ddgg dd| j                            d          fg          S rj  r  r   s    rY   r  zUdopConverter.post_processor  rk  r[   Nr   r   r   r  rk   r[   rY   rt  rt    s#        
 
 
 
 
r[   rt  c                       e Zd ZdefdZdS )WhisperConverterr]   c           
      `   | j         j        }t          | j         j                                                  }t          t          ||d ddd                    }t          j        | j         j	                  |_
        t          j                    |_        | j         j        }| j                             |          }| j         j        }| j         j        }d                    d |D                       }t%          j        | d| d| d	| d
||fgt)          ||                    |_        |S )NrK   Fr6  r8  r  c                     g | ]}| d S )r   rk   rp  s     rY   r{   z.WhisperConverter.converted.<locals>.<listcomp>.  s    #G#G#GUuLLL#G#G#Gr[   z $A:0 r   z $A:0 $B:1 r   r   )rd   r,  r   r-  r.  r   r   r   r<  r\   r  r	   r  prefix_tokensconvert_ids_to_tokensr  r_  joinr   r  zipr  )	r   rm   r   r
  prefix_token_idsprefixesr`  r_  prefix_templates	            rY   r   zWhisperConverter.converted  sQ   '/d-7<<>>??*,#%  	
 	
	 #1":DLcLt"u"u"u	$.00	2@*@@AQRR%/.;((#G#Gh#G#G#GHH#-#@%44S444#77777l#X/00$
 $
 $
	  r[   Nr  rk   r[   rY   rx  rx    s/         9            r[   rx  c                       e Zd Zd ZdS )BigBirdConverterc           	          t          j        ddd| j                            d          fd| j                            d          fg          S r  r  r   s    rY   r  zBigBirdConverter.post_processor<  r  r[   Nrv  rk   r[   rY   r  r  ;  s#        
 
 
 
 
r[   r  c                       e Zd ZdefdZdS )CLIPConverterr]   c                 
   | j         j        }t          | j         j                                                  }| j         j        }t          t          ||d dddt          |                              }t          j
        t          j                    t          j        t          d          d          t          j                    g          |_        t!          j
        t!          j        t          d          dd	
          t!          j        d          g          |_        t)          j                    |_        t-          j        | j         j        | j         j        f| j         j        | j         j        fdd          |_        |S )NrK   r&  Frm   r   r'  r7  r(  r)  r   z\s+r  z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTrN  r8  rY  )rd   r,  r   r-  r.  r   r   r   r   r
   rT  rS  r  r   r  r   r   rU  r<  r  r	   r  r   rZ  r  r_  r=  r>  r  r2  s        rY   r   zCLIPConverter.convertedH  ss   '/d-7<<>>??+5	*,#)i..  

 

	  +3_ 3E&MM3 G GI^I`I`a 
  
	 #1"9$Z[[&  
 (%@@@	#
 	#
	 %.00	 $.#?(2D4K4XY(2D4K4XY"	$
 $
 $
	  r[   Nr  rk   r[   rY   r  r  G  s/        '9 ' ' ' ' ' 'r[   r  c                       e Zd ZdefdZdS )LayoutLMv2Converterr]   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )Nr   FTr   r   r   r   r   r   r   r   r   r   r	  s
             rY   r   zLayoutLMv2Converter.converteds  sy   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r[   Nr  rk   r[   rY   r  r  r  r  r[   r  c                       e Zd ZdefdZdS )BlenderbotConverterr]   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        d|j         d|j        |j        fg          |_        |S )NrK   Fr6  r8  z$A:0 r   )r   r   )rd   r,  r   r-  r.  r   r   r   r<  r\   r  r	   r  r   r  r  r_  r  r[  s        rY   r   zBlenderbotConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#@+2<+++r/$
 $
 $
	  r[   Nr  rk   r[   rY   r  r    r]  r[   r  c                        e Zd Zd Zd Zd ZdS )XGLMConverterc                 T    g d}|d |j         dd          D             z  }|g dz  }|S )Nr  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z'XGLMConverter.vocab.<locals>.<listcomp>  r  r[   r   ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  rz  r  s      rY   rm   zXGLMConverter.vocab  s[    
 
 
 	KK%,qrr:JKKKK  z  z  z  	zr[   c                 
    d}|S r  rk   r  s      rY   r   zXGLMConverter.unk_id  r  r[   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s    rY   r  zXGLMConverter.post_processor  sY    ,'/EEeLLM0FFvNNO
 
 
 	
r[   Nr7  rk   r[   rY   r  r    sA        	 	 	  
 
 
 
 
r[   r  c                   >    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zd	S )
GemmaConverterTz<start_of_turn>z<end_of_turn>c                 ,    t          j        dd          S Nr  r  )r
   r  r{  s     rY   r   zGemmaConverter.normalizer  s    "3...r[   c                    | j         j        df| j         j        df| j         j        dfg}|d |j        dd          D             z  }t          d |D                       s.t          d t          |          D             d           }|d||<   |S )Nr  c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z(GemmaConverter.vocab.<locals>.<listcomp>  r  r[   r   c              3   .   K   | ]}|d          dk    V  dS )r   r   Nrk   )rz   rl   s     rY   r  z'GemmaConverter.vocab.<locals>.<genexpr>  s*      //A1Q44<//////r[   c              3   8   K   | ]\  }}|d          dk    |V  dS )r   r   Nrk   )rz   r   rl   s      rY   r  z'GemmaConverter.vocab.<locals>.<genexpr>  s4      "V"VAQqTXEUEU1EUEUEUEU"V"Vr[   )r   r  )rd   r  r  r=  r   anynextr   )r   r   rm   override_indexs       rY   rm   zGemmaConverter.vocab  s    $.4$.4$.4

 	KK%,qrr:JKKKK /////// 	4!"V"V51A1A"V"V"VX\]]N)(3n%r[   c                 ,    t          j        dd          S )Nr  merged_with_previous)r   rU  r   r  r\   s      rY   r  zGemmaConverter.pre_tokenizer  s    #C)?@@@r[   c                 
    d}|S r  rk   r  s      rY   r   zGemmaConverter.unk_id  r  r[   c                     t          j        t          j        dd          t          j                    t          j                    g          S )Nr  r  )r	   rT  r  ByteFallbackFuser  s      rY   r  zGemmaConverter.decoder  sA      ,,%''
 
 	
r[   N)r   r   r   rs  r   r  r   r   rm   r  r   r  rk   r[   rY   r  r    s|        .L'9N/ / /   A A A  
 
 
 
 
r[   r  c                   6    e Zd ZdZd Zd Zd Zd Zd Zd Z	dS )	LlamaConverterTc                     | j                             d          df| j                             d          df| j                             d          dfg}|d |j        dd          D             z  }|S )Nr   r  r   rr   c                 *    g | ]}|j         |j        fS rk   r   r   s     rY   r{   z(LlamaConverter.vocab.<locals>.<listcomp>  r  r[   r   )rd   r|  r   r  s      rY   rm   zLlamaConverter.vocab  s    $::1==sC$::1==sC$::1==sC

 	KK%,qrr:JKKKKr[   c                 
    d}|S r   rk   r  s      rY   r   zLlamaConverter.unk_id  r  r[   c                     t          j        dd          t          j                    t          j                    g}|r|t          j        dd          gz  }t          j        |          S Nr  r  r   )contentr  r	   r  r  r  r  rT  r   r  r\   sequences       rY   r  zLlamaConverter.decoder  e    UC((!##MOO

  	>!<<<==H ***r[   c                     t          | j        dd          r_g }t          | j        dd          r|t          j        d          gz  }|t          j        dd          gz  }t          j        |          S d S )Nr`   Tr\   r  )prependr  )patternr  )rc   rd   r
   Prependr  rT  )r   r   r  s      rY   r   zLlamaConverter.normalizer  s    4*Hd;; 	2Ht.0BDII A[0???@@,S%HHHIIH'111tr[   c                     t          | j        dd          s,t          || j                  }t          j        ||d          S d S )Nr`   TFr  re   split)rc   rd   rf   r   r  r  s       rY   r  zLlamaConverter.pre_tokenizer(  sL    t.$?? 	q01A4CZ[[N!+Tbjopppptr[   c                     d S r   rk   r   s    rY   r  zLlamaConverter.post_processor.  s    tr[   N)
r   r   r   rs  rm   r   r  r   r  r  rk   r[   rY   r  r    st            + + +        r[   r  c                       e Zd ZdefdZdS )MarkupLMConverterr]   c                 (   | j         }|j        }t          |j                                                  }t          t          ||d ddd| j         j                            }t          j	        |j
                  |_        t          j	                    |_        t          | j         j                  }t          | j         j                  }| j         j        }| j         j        }t'          j        | d| | d| d| ||f||fg          |_        |S )NrK   Fr  r8  z $A z $B r   )rd   r,  r   r-  r.  r   r   r   r   r<  r\   r  r	   r  r   r  r  r  r  r   r  r  )	r   r\  rm   r   r
  r  r  r  r  s	            rY   r   zMarkupLMConverter.converted4  s1   $
bl''))***,#%1;  

 

	 #1":BL_"`"`"`	$.00	$)344$)344.;.;#-#@$$s$$++S++c++l#l#$
 $
 $
	  r[   Nr  rk   r[   rY   r  r  3  s/        "9 " " " " " "r[   r  c                   *    e Zd ZdZd Zd Zd Zd ZdS )MoshiConverterTc                 L   t          | d           t                              | |           t                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _        d S NrR   r   	r   r   r   rZ   r   r   r   r   r   r   rr  r   r   r   r   s         rY   r   zMoshiConverter.__init__\  s    $
+++4,,, $%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


   (BBBc                     |j         j        }t          j        dd          g}|st          j        |          S t          j        t          j        |          g|z             S r  )r  r  r
   r  rT  r  r  s       rY   r   zMoshiConverter.normalizeri  sg    $4IU++
 $ 	h'555')@AU)V)V(WZf(fgggr[   c                     t          j        dd          t          j                    t          j                    g}|r|t          j        dd          gz  }t          j        |          S r  r  r  s       rY   r  zMoshiConverter.decoders  r  r[   c                 4    d}t          j        ||d          S )Nra   Fr  )r   r  r  s       rY   r  zMoshiConverter.pre_tokenizer}  s!     'KP^fkllllr[   N)r   r   r   rs  r   r   r  r  rk   r[   rY   r  r  Y  s^          h h h+ + +m m m m mr[   r  c                   D    e Zd ZdZddZd Zd Zd Zd Zd Z	d	 Z
d
 ZdS )HeliumConverterTNc                 L   t          | d           t                              | |           t                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _        d S r  r  r  s         rY   r   zHeliumConverter.__init__  s    $
+++4,,,#%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


r  c                                           |          }t          t          |                     |           j                            } fdt          |j                  D             }|                    d t          |d           D                        |                    t          ddd          g           |
                    d	d
           |S )Nr~  c                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S r   r  r  s      rY   r{   z-HeliumConverter.tokenizer.<locals>.<listcomp>  r  r[   c                 <    g | ]\  }}}t          |d |d          S )FT)r   r   single_wordr   r   s       rY   r{   z-HeliumConverter.tokenizer.<locals>.<listcomp>  s@       &Bw 5UGQUVVV  r[   c                     | d         S r   rk   r   s    rY   rn   z+HeliumConverter.tokenizer.<locals>.<lambda>  r  r[   ro   
Fr   r  r   )r  pad_id)rm   r   r   r   rs  r   r   r  r   r   enable_padding)r   r   r   r
  r   s   `    rY   r
  zHeliumConverter.tokenizer  s   zz%(({{5))"7  
 
	
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 	j%OOOPQQQ  71 ===r[   c                 t    g }|j         D ]-}|j        dk    r|d|j        fgz  }||j        |j        fgz  }.|S )Nz<0x0A>r  )r   r   r   )r   r   rm   r   s       rY   rm   zHeliumConverter.vocab  sX    \ 	6 	6E{h&&4-..5;455r[   c                 
    d}|S r   rk   r  s      rY   r   zHeliumConverter.unk_id  r  r[   c                     t          j        dd          t          j                    t          j                    g}|t          j        dd          gz  }t          j        |          S r  r  r  s       rY   r  zHeliumConverter.decoder  s]    UC((!##MOO

 	X^Ca88899 ***r[   c                 x    t          j        t          j        d          t          j        dd          g          S r  )r
   rT  r  r  r{  s     rY   r   zHeliumConverter.normalizer  s2    #[%8%=%={?RSWY^?_?_$`aaar[   c                 R    t          j        t          j        dd          g          S )Nr  
contiguous)r   rT  rU  r  s      rY   r  zHeliumConverter.pre_tokenizer  s#    &(<T<(P(P'QRRRr[   c                 :    t          j        ddgg ddg          S )Nr  r]  )r  r]  r  r^  )r  r   r   )r   r  r   s    rY   r  zHeliumConverter.post_processor  s@    ,   
 
 
 	
r[   r   )r   r   r   rs  r   r
  rm   r   r  r   r  r  rk   r[   rY   r  r    s        
 
 
 
  8    + + +b b bS S S
 
 
 
 
r[   r  c                        e Zd ZdZddZd ZdS )ParakeetConverterTNc                 Z   || _         t          | d           t                              | |           t	                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _	        d S r  )
rr  r   r   r   rZ   r   r   r   r   r   )r   rr  rv  r   r   r   s         rY   r   zParakeetConverter.__init__  s    $$
+++4,,,#%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


s   %(BB Bc           
                                |          }                      j                                      |          \  }}d t	          |          D             }t          t          |||j        j        d j	        d                     } fdt	          |j
                  D             }|                    d t          |d           D                        |S )Nc                      i | ]\  }\  }}||S rk   rk   r   s       rY   r   z/ParakeetConverter.tokenizer.<locals>.<dictcomp>  s#    MMM!1MT5T1MMMr[   Tr  c                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S r   r  r  s      rY   r{   z/ParakeetConverter.tokenizer.<locals>.<listcomp>  r  r[   c                 :    g | ]\  }}}t          |d |          S r   r   r   s       rY   r{   z/ParakeetConverter.tokenizer.<locals>.<listcomp>  r  r[   c                     | d         S r   rk   r   s    rY   rn   z-ParakeetConverter.tokenizer.<locals>.<lambda>  r  r[   ro   )rm   r  rr  r   r   r   r   r   r  rs  r   r  r   )r   r   r   r  r   r  r
  r   s   `       rY   r
  zParakeetConverter.tokenizer  s   zz%((%%do66>>|LL	6MMY|5L5LMMM	,6"7  	
 	
	
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 r[   r   )r   r   r   rs  r   r
  rk   r[   rY   r  r    s=               r[   r  c            	      \   t          t          t          d          t          d          dz                       t          t          t          d          t          d          dz                       z   t          t          t          d          t          d          dz                       z   } | dd         }d	}t          d
          D ]8}|| vr2|                     |           |                    d
|z              |dz  }9d |D             }t	          t          | |                    S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 ,    g | ]}t          |          S rk   )chr)rz   ns     rY   r{   z$bytes_to_unicode.<locals>.<listcomp>  s    			Q#a&&			r[   )r   r   ordr   r}   r~  )bscsr  bs       rY   bytes_to_unicoder    s    	U3s88SXX\**++d5TCIIPQM3R3R.S.SSVZ[`adeiajajloptluluxyly[z[zV{V{{  
AAAB	A4[[  B;;IIaLLLIIdQhFA		"			BBr[   c                   @    e Zd ZdZ	 	 	 	 ddZdefdZd Zd	efd
Z	dS )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                     || _         || _        || _        t          |t                    r|                                n|| _        d S r   )rr  r  r\   r,  r}   r.  extra_special_tokens)r   rr  r  r\   r  r   s         rY   r   zTikTokenConverter.__init__(  sN     % 0+56JD+Q+Qk %%'''Wk 	!!!r[   tiktoken_urlc                 f   	 ddl m} n# t          $ r t          d          w xY w ||          t	                      fdg }i }                                D ]\  }}|| |          <   t          |          dk    r'g }t          dt          |                    D ]=}|d |         ||d          }
}	|	v r#|
v r|	|
z   v r|                    |	|
|f           >t          |fdd          }|
                    |           t          |d	 d          }fd
|D             }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                 l    d                     fd|                     d          D                       S )NrK   c                 :    g | ]}t          |                   S rk   r  rz   charbyte_encoders     rY   r{   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>C  $    TTTLT3TTTr[   latin-1r}  decoder  r   s    rY   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringB  s6    77TTTT@S@STTTUUUr[   r   c                 <    | d                  | d                  fS rj   rk   )rl   r-  s    rY   rn   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>P  s    1Q4)AaD/0R r[   Frv   c                     | d         S Nrr   rk   rt   s    rY   rn   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>R  
    A r[   c                 T    g | ]$} |d                     |d                   f%S ry   rk   rz   ru   r  s     rY   r{   zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>S  ?    cccUX((Q002G2GA2O2OPcccr[   )tiktoken.loadr  r  
ValueErrorr  r~   rs   r   r   r   r   )r   r  r  r   rm   r   rankr   r   r   r   r-  r   r  s              @@@rY   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_model7  s   	7777777 	 	 	k  	
 &%l33	'))	V 	V 	V 	V 	V $??,, 
	! 
	!KE426E''../5zzQEq#e**-- ; ;#(%=%-i''Gy,@,@gPWFW\eEeEeLL'7D!9:::5&R&R&R&R\abbbEMM%    $6$6FFFcccc\bcccf}s    &c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S NF)r)  ignore_mergesTr  rr  r   r   r   r   r  r   r   r   r
  s       rY   r
  zTikTokenConverter.tokenizerV  \    #CCDOTTfc,GGGHH	9?O44 	1,0IO)r[   r]   c                    |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        | j        $|                    d | j        D                        t          j        d          |_        |S )NrM  FrN  rQ  c                 2    g | ]}t          |d d          S )FTr   r   rp  s     rY   r{   z/TikTokenConverter.converted.<locals>.<listcomp>i  s'    jjjuEeTBBBjjjr[   r:  )r
  r   rT  rU  r   r  r<  r\   r  r	   r  r  r0  r   r  r   r
  s     rY   r   zTikTokenConverter.converted]  s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	$0((jjPTPijjj   $.#7U#K#K#K	 r[   Nr  FN)
r   r   r   r   r   r   r  r
  r   r   rk   r[   rY   r  r  #  s           K!
 
 
 
C    >  9      r[   r  c                   <    e Zd Z	 	 	 	 d
dZdefdZd Zdefd	ZdS )MistralConverterNr  Fc                     || _         || _        || _        t          |t                    r|                                n|| _        d S r   )rr  r  r\   r,  r}   r.  r   )r   rr  r  r\   r   r   s         rY   r   zMistralConverter.__init__r  sR     % 0 3T::+%**,,,* 	&&&r[   r  c                    dd l dd l}t          | j        dd          5 }|                    |          }d d d            n# 1 swxY w Y   |d         d         | _        d |d         D             | _        |d	         }t                      t          fd
            g }i }t          | j                  D ]\  }}	|||	j
        <   fd|D             }t          |          }
d t          |          D             t          t          |d                    D ]\  }}	|| |	          <   t          |	          dk    r'g }t          dt          |	                    D ]=}|	d |         |	|d          }}||
v r#||
v r||z   |
v r|                    |||f           >t!          |fdd          }|                    |           t!          |d d          }fd|D             }||fS )Nr   rzutf-8)encodingconfigr  c                 H    g | ]}t          |d          |d                    S )	token_str
is_control)r   r   )rz   ks     rY   r{   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  s:     *
 *
 *
DEJq~q???*
 *
 *
r[   r   rm   c                 l    d                     fd|                     d          D                       S )NrK   c                 :    g | ]}t          |                   S rk   r  r  s     rY   r{   zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>  r  r[   r  r  r  s    rY   r  zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s6    77TTTT@S@STTTUUUr[   c                 F    g | ]}                     |d                    S )token_bytes)	b64decode)rz   r&  base64s     rY   r{   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  s,    KKKAV%%a&677KKKr[   c                     i | ]\  }}||	S rk   rk   )rz   r  r   s      rY   r   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<dictcomp>  s    MMMuMMMr[   z(Converting tekken.json to tokenizer.json)descr   c                 <    | d                  | d                  fS rj   rk   )rl   token_to_ranks    rY   rn   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s!    qt1DmTUVWTXFY0Z r[   Frv   c                     | d         S r	  rk   rt   s    rY   rn   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>  r
  r[   c                 T    g | ]$} |d                     |d                   f%S ry   rk   r  s     rY   r{   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  r  r[   )r,  jsonr   rr  loadr  r   r  r   r   r  r|   r   rs   r   r   r   r   )r   r  r3  r   untypedr-  r   rm   idxr   rank_setr  r   r   r   r   r,  r   r  r0  s                   @@@@rY   r  z0MistralConverter.extract_vocab_merges_from_model  s   $/3999 	#QiillG	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	#x(3*
 *
IPQaIb*
 *
 *
& G$	'))		V 	V 	V 	V 
	V #D$BCC 	' 	'JC#&E%-  KKKKKKK	y>>MM	)8L8LMMM$T):d%e%e%eff 
	! 
	!KD%26E''../5zzQEq#e**-- ; ;#(%=%-h&&7h+>+>GgDUZbCbCbLL'7D!9:::5&Z&Z&Z&ZdijjjEMM%    $6$6FFFcccc\bcccf}s   AA
A
c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S r  r  r  s       rY   r
  zMistralConverter.tokenizer  r  r[   r]   c                 |   |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        |                    | j                   t          j        d          |_        |S )NrM  FrN  rQ  r:  )r
  r   rT  rU  r   r  r<  r\   r  r	   r  r  r   r   r  r  s     rY   r   zMistralConverter.converted  s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	T;<<<#-#7U#K#K#K	 r[   r  )	r   r   r   r   r   r  r
  r   r   rk   r[   rY   r  r  q  s          K"&
 
 
 
"%C % % % %N  9      r[   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerReformerTokenizerRemBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizerSplinterTokenizerXGLMTokenizer)LlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                 ^   | j         j        }|t          v r,|s*t          |         } ||                                           S | j                            d          rG| | _        t                              d           t          | j                                                  S 	 t                              d           t          | j        | j                                                  S # t          $ r7 t          dt          t                                                               w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    ztekken.jsonz#Converting from Mistral tekken.jsonzConverting from Tiktoken)rr  r  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )rw  r   SLOW_TO_FAST_CONVERTERSr   rr  endswithrd   loggerinfor  r  r  r  r  r   r.  )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       rY   convert_slow_tokenizerrz    s;     1:C666}612FG455??AAA		)	2	2=	A	A 3H09::: 5 @AAKKMMM	KK2333$0;%:%O   ikk  	 	 	e>BCZC_C_CaCa>b>be e  	s   $AC+ +AD,)rK   r   )F)Wr   rt  collections.abcr   	functoolsr   	packagingr   
tokenizersr   r   r   r	   r
   r   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerr   rt  r  r   rZ   boolr   rf   r   r   r   r   r   r   r  r  r!  r$  r4  rC  rJ  rW  r_  rf  rm  r  r  r  r  r  r  r"  r0  r9  r@  rF  rJ  rN  rP  rb  rt  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rr  rz  rk   r[   rY   <module>r     s     & & & & & &             f f f f f f f f f f f f f f f f f f 5 5 5 5 5 5 5 5 5 5       ` ` ` ` ` ` ` ` ` ` ` ` 5 5 5 5 5 5 
	H	%	%  8 $ ' ' '  >G G G G"$ s     jo6L    01 1 1 1 1 1 1 1h    "8   "Ic Id I I I I$ $ $ $ $ $ $ $$ $ $ $ $I $ $ $N/ / / / /	 / / /d$ $ $ $ $i $ $ $N$ $ $ $ $Y $ $ $N       6# # # # #I # # #L    y   >) ) ) ) )Y ) ) )X    y   :$ $ $ $ $	 $ $ $N    y   >H H H H H9 H H HV"
 "
 "
 "
 "
l "
 "
 "
J
 
 
 
 
| 
 
 
 - - - - - - - -`
 
 
 
 
 
 
 
BG G G G G\ G G GT- - - - -| - - -`, , , , ,L , , ,^
 
 
 
 
< 
 
 
2, , , , ,, , , ,^"
 "
 "
 "
 "
\ "
 "
 "
J	 	 	 	 	 	 	 	
 
 
 
 
| 
 
 
@	 	 	 	 	l 	 	 	%x %x %x %x %x| %x %x %xP    ,   8
 
 
 
 
L 
 
 
! ! ! ! !y ! ! !H	
 	
 	
 	
 	
| 	
 	
 	
( ( ( ( (I ( ( (V$ $ $ $ $) $ $ $N    )   :
 
 
 
 
L 
 
 
61
 1
 1
 1
 1
\ 1
 1
 1
h+ + + + +\ + + +\# # # # #	 # # #L&m &m &m &m &m\ &m &m &mRV
 V
 V
 V
 V
l V
 V
 V
r- - - - - - - -`  0K K K K K K K K\M M M M M M M M`88%8 (8 ]	8
 (8 .8 ,8 ]8 8 8 (8 ,8 =8 -8 "=8  !-!8" #8 8$ _%8& '8( ])8* (+8, -8. =/80 +182 -384 +586 $788 }98: *;8< n=8> (?8@ nA8B =C8D $E8 8 8F ]G8H ,I8J (K8L nM8N *O8P (Q8R (S8T *U8V 0W8X MY8Z ;[8\ ]]8^ (_8` .a8b nc8d *e8f ]g8 8h %($#o8 8 8 v$ $) $ $ $ $ $ $r[   