
    bi                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4  e4j5        e6          Z7dZ8dZ9dZ:dZ;dZ<e*dz  Z*ee e!e"dZ=e8e;dZ> e3e*           G d de-                      Z?e?Z@dS )z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file   )convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)r   r   	WordLevel	WordPiece)tokenizer_file
vocab_filec            )           e Zd ZdZeZdZdZedLd            Z	 fdZ
edefd            Zedefd            ZdMd	ed
edz  dee         fdZd Zed             Zed             Zej        d             Zej        d             Zd Zedefd            Zdeeef         fdZedeeef         fd            Zedeeef         fd            Zedeeef         fd            ZeZeZ deeef         fdZ!defdZ"defdZ#ede$fd            Z%ede&fd            Z'	 	 	 	 	 	 	 dNde(dedz  dedz  d ed!ed"ed#ed$edeeee)f         e*e(         f         fd%Z+d&edefd'Z,d(ededz  fd)Z-dLd*e*eez           defd+Z.dLd,edefd-Z/dLd.ee*e         z  d/edee*e         z  fd0Z0dOd1ed,edz  d2ede*e         fd3Z1d4e2d5e3d6ed7ed8edz  d9edz  fd:Z4dde2j5        e3j6        dd;ddddddddddddfd1e7e8z  e*e7         z  e*e8         z  d<e7e8z  e*e7         z  e*e8         z  dz  d2ed4e2d5e3d6edz  d7ed=ed8edz  d9edz  d>edz  dedz  dedz  d ed!ed"ed#ed$ed?edz  de9f(d@Z:dAe*e         defdBZ;	 	 dPdCee*e         z  d/edDedz  defdEZ<	 	 dQd	ee=j>        z  dFeedGf         dHedz  d
edz  deedGf         f
dIZ?	 	 	 dRdJZ@e	 	 	 	 	 	 	 dSdK            ZA xZBS )TTokenizersBackendaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    NFc                    t          |          }|                    dd          }|Lt          j                            |          r-| t
          u sd| j        vs|rt          j        |          |d<   |S |t          j                            |          rt          j        |          j	        }t          |d          5 }t          j        |          }ddd           n# 1 swxY w Y   |                    di                               dd          }| j        8t          |t                     r"t!          t#          t$          |                    }n| j        j        d	k    r1|r.t          |d
         t           t$          f          rd |D             }nx| j        j        dk    rd t)          |          D             }nN| j        j        dk    s| j        j        dk    r.t          |t                     rd t)          |          D             }||d<   t+          | dd          }	d|                    di           v r,|	r*|	j        dk    r|d         d         }
d |
D             }
|
|d<   |||d<   |S |                    d          }|                    d          }|                    d          }|                    d          }
t          |t,                    rd|                    d          rOt          j                            |          r0ddlm}  ||                              |          \  |d<   |d<   |S t          |t,                    rqt          j                            |          rQ|                    d          r;	 ddlm}   ||          j        | j        fi |}	 ddlm} |                    | j                  }|t=          |d          r |j        d-i |}n=# t@          $ r0}tB          "                    d| j         d| d            Y d}~nd}~ww xY wt=          | d!          r | j#        d-i |}nz# t@          $ rm}tB          "                    d"| d#| d$           dd%lm$}  |||                    d&          '                              |          \  |d<   |d<   Y d}~nd}~ww xY w|S |At          |t,                    r,t          j                            |          r||d<   |d         }|
At          |t,                    r,t          j                            |          r||d<   |d         }
|
| j        | j        j        dk    rt          |t                     rd(tJ          tL                   d)t           t,                   ffd*g d+}tO                      }|D ]+}||v r%|(                     ||         g                     ,tS          ||,          }
|
|d<   |S ).zs
        Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
        models, tekken.json, vocab/merges).
        r$   N__init__tokenizer_objectutf-8encodingmodelvocabr   r   c                 ,    g | ]}t          |          S  )tuple).0items     e/root/projects/butler/venv/lib/python3.11/site-packages/transformers/tokenization_utils_tokenizers.py
<listcomp>z>TokenizersBackend.convert_to_native_format.<locals>.<listcomp>   s    ;;;TU4[[;;;    r"   c                     i | ]\  }}||	S r1   r1   r3   itokens      r5   
<dictcomp>z>TokenizersBackend.convert_to_native_format.<locals>.<dictcomp>   s    CCChaCCCr7   r   r#   c                 T    i | ]%\  }}t          |t                    r|d          n||&S r   )
isinstancelistr9   s      r5   r<   z>TokenizersBackend.convert_to_native_format.<locals>.<dictcomp>   s8    pppS[STV[E4)@)@KU1XXeQpppr7   mergesc                     g | ]H}t          |t                    r"t          |                    d                     nt          |          IS ) )r?   strr2   split)r3   merges     r5   r6   z>TokenizersBackend.convert_to_native_format.<locals>.<listcomp>   sK    rrrbgZs5K5K]%C 0 0111QVW\Q]Q]rrrr7   post_processorr%   merges_fileztekken.jsonr   )MistralConverter)r%   .model)SentencePieceExtractor)SLOW_TO_FAST_CONVERTERSconvert_from_spmz,Could not reorder vocab using converter for z due to z/. Falling back to raw SentencePiece extraction.convert_from_spm_modelz+Could not extract SentencePiece model from z$ using sentencepiece library due to z%. Falling back to TikToken extractor.)TikTokenConverterextra_special_tokens)r%   rP   valuesreturnc                     g }| D ]b}|t          |t          t          f          r|                     |                     @|                    t          |                     c|S N)r?   r@   r2   extendappendrD   )rQ   	collectedval_iter_special_tokenss      r5   rY   zHTokenizersBackend.convert_to_native_format.<locals>._iter_special_tokens   s{    ')	! 3 3C{ !#e}55 3!(()=)=c)B)BCCCC!((S2222  r7   )		pad_token	unk_token	bos_token	eos_token	sep_token	cls_token
mask_tokenadditional_special_tokensrP   )skip_tokensr1   )*dictpopospathisfiler'   __dict__TokenizerFast	from_filerG   openjsonloadgetr.   r?   r@   mapr2   __name__	enumerategetattrrD   endswithconvert_slow_tokenizerrI   extract_vocab_merges_from_modelrK   extractrL   hasattrrM   	ExceptionloggerwarningrN   rO   r   r   setupdater   )clstrust_remote_codekwargslocal_kwargsfast_tokenizer_file	processortokenizer_handletokenizer_jsonr/   
model_typerA   r%   rH   rI   rK   rL   converter_classerO   special_tokens_keysrb   keyrY   s                         @r5   convert_to_native_formatz*TokenizersBackend.convert_to_native_formatd   s[    F||*../?FF  +233 ,)))Zs|-K-KO`-K/</FGZ/[/[L+, ,@S1T1T,%/0CDDSI)G<<< =@P!%+;!<!<= = = = = = = = = = = = = = ="&&w3377FFEy eT** 4 UE!2!233E#y00 <Za4-@@ <;;U;;;E#{22CC)E2B2BCCC#u,,	0Bk0Q0QeT** qpp_hin_o_opppE$)L! gt44J>--gr::::
:zObfkOkOk'0:rrkqrrr)/X&$1:-.!%%l33
"&&}55  ))!!(++ j#&& 	 :+>+>}+M+M 	 RTRYR`R`akRlRl 	 @@@@@@<L<L%= = =--j99 :L!<#9   j#&& 	 27>>*+E+E 	 *J]J]^fJgJg 	 >JJJJJJI55jAAI#)ddWcdd	OOOOOO&=&A&A#,&O&OO&2wPb7c7c2'G'G'W'W,'W'W    NN Ps|  P  P]^  P  P  P       
 3 899 N#=3#=#M#M#M#ML 	> 	> 	>:* : :rs : : :   FEEEEE@Q@Q)@P@PQg@h@hA A A11*== >W%|H'='='='='='=	>   =Z
C88=RW^^J=W=W=$.L! )E>jc::>rw~~k?Z?Z>%0L"!(+F >ci3	8Je8S8SXbchjnXoXo8S	!Xc] 	!tCy 	! 	! 	! 	! 	! 	!
# 
# 
# %(EEK* R R,&&&&';';\#=N<O'P'PQQQ$UDDDF%+L"sU   C##C'*C'"Q 0?O0 /Q 0
P*:&P% Q %P** Q 
SA#R==Sc           	      p   |                     dd           }|                     dd           }|                     dd           }|                    di           }|                    dd          }|                    d          }|                    d          }	|                    d	          }
d }|t          j        |          }n|5t          j                            |          rt          j        |          }n|t          |                    d
d          |fi |}t          |          }|d         d         }|d         }|d         }t          ||          \  }}|                    |           t          |          dk    r|                    |           n| j        |	|
Qt          |	t                     r|	nd t#          |	          D             }t          t%          ||
dd                     }nt          |	t                     r!t          t%          |	g dd                     }nt          |	t&                    rV|	rTt          |	d         t(          t&          f          r2t          t+          |	|                    dd                              }n| j        t-          d          |5|3| j        ,|                    dd           |                    dd           ||| _        | j        t-          d          | j        j        }| | j        j        d8i | |                    d|d                    |                    d|d                    |                    d|d                    |                    d |d!                    n| j                                         | j        j        }| | j        j        d8i | |                    d"|d"                    |                    d#|d$                    |                    d%|d                    |                    d|d&                    |                    d'|d'                    d(|vrd)|d(<   d*|v pd+|v }|                    d*d          | _        |                    d+d          | _        |                     d,d           x}r|| j        _        |p| j        j        d u | _          tC                      j"        d8i | ||| _#        || _$        | j%        | j        _&        d- | j'        D             fd.tQ          |)                                d/ 0          D             }t'          | j*        +                                          d1 |D             z   }| j,        -                                D ]/}|t]          |          |vr||vr|/                    |           0| j0        D ],}t]          |          |vr||vr|/                    |           -t          |          dk    rg }d2 | j,        -                                D             }|D ]r}t          |t\                    rtc          |d3          }n4t          |tb                    r|j2        st]          |          |v rd|_2        |/                    |           s|r| 3                    |           	 | j        4                                }n# tj          $ r d}Y nw xY w|d4k    rytm          | j        d5d           c|                     dd             | j7        | j        | j8                            d
d           f| j8        |                    d6          d7|| _        | j         p| j        j        d u | _         | j         r| 9                                 d S d S )9Nr*   	gguf_filer$   added_tokens_decoderadd_prefix_spaceFr%   r/   rA   name_or_path configr   	tokenizertokenizer_configr   c                      i | ]\  }\  }}||S r1   r1   )r3   r:   w_s       r5   r<   z.TokenizersBackend.__init__.<locals>.<dictcomp>  s%    CkCkCkYQPVQRTUAqCkCkCkr7   T)r/   rA   fuse_unkdropoutunk_id)r/   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.r\   z<s>r]   z</s>z3The backend tokenizer is not correctly initialized.
max_lengthtruncation_side	directionstridetruncation_strategystrategyrZ   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenrG   c                 F    h | ]}t          t          |                    S r1   hashreprr3   r;   s     r5   	<setcomp>z-TokenizersBackend.__init__.<locals>.<setcomp>K  s&    $^$^$^5T$u++%6%6$^$^$^r7   c                 V    g | ]%\  }}t          t          |                    v#|&S r1   r   )r3   indexr;   added_tokens_decoder_hashs      r5   r6   z.TokenizersBackend.__init__.<locals>.<listcomp>L  sA     
 
 
uDKK  (AAA AAAr7   c                     | d         S Nr   r1   )xs    r5   <lambda>z,TokenizersBackend.__init__.<locals>.<lambda>N  s    STUVSW r7   r   c                 ,    g | ]}t          |          S r1   rD   r   s     r5   r6   z.TokenizersBackend.__init__.<locals>.<listcomp>Q  s    ;b;b;b5CJJ;b;b;br7   c                 0    g | ]}|t          |          S r1   r   )r3   ts     r5   r6   z.TokenizersBackend.__init__.<locals>.<listcomp>b  s$    WWW1UVWAWWWr7   )speciali pre_tokenizerfix_mistral_regex)init_kwargsr   r1   ):rd   rn   copydeepcopyre   rf   rg   ri   rj   r   r   r   r|   len
_tokenizerr?   rc   rq   r   r@   r2   r   
ValueError
setdefault
truncationenable_truncationno_truncationpaddingenable_padding_add_bos_token_add_eos_tokenrG   _should_update_post_processorsuperr)   r%   r   split_special_tokensencode_special_tokensr   sorteditemsadded_tokens_encoderkeys_special_tokens_maprQ   rD   rV   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorrr   _patch_mistral_regexr   update_post_processor) selfargsr   r*   r   r   r   r   r%   r/   rA   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargs
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsrG   tokens_to_addencoderspecial_token_valuer;   tokensall_named_tokens
vocab_sizer   	__class__s                                  @r5   r)   zTokenizersBackend.__init__   sv   !::&8$??JJ{D11	$jj)94@@%zz*@"EE!::&8%@@ZZ--


7##H%%'!]+;<<NN ,@S1T1T,*45HIINN"#FJJ~r$B$BIXXQWXXI-i88J%h/=L'4N)*<=0F|Uc0d0d-N-MM*+++$%%))/000_$):!&0&=&=kUUCkCkZcdiZjZjCkCkCk
!.sF]ako/p/p/p!q!qE4(( e!.srTXbf/g/g/g!h!hE4(( eU ez%(UTXM7Z7Z e!.wU6::V^`aKbKb/c/c/c!d!d_$r   &+;+CH_k5111k6222%,DO?"RSSSo0"-DO-<<<<<lK,EFFF/[1IJJJhH(=>>>3[5LMMMMO))+++?**DO*66X666k8K+@AAA18M3JKKKnh{.CDDDlHX,>???2H=Q4RSSS F"" ,F9%4%>%[/U[B["$jj%@@$jj%@@#ZZ(8$???> 	<-;DO*-G-q4?KimqKq*""6"""!(DO 0040I-$^$^DD]$^$^$^!
 
 
 
 &';'A'A'C'C X X X
 
 

 t0557788;b;bTa;b;b;bb $(#;#B#B#D#D 	: 	:"*&''w66;NVc;c;c$$%8999 / 	, 	,E5zz((U--G-G$$U+++}!!FWW0H0O0O0Q0QWWW& % %eS)) -&ud;;;EEz22 - = -SZZ;K-K-K(,e$$$$ ('''	7799JJ" 	 	 	JJJ	 74?OT#R#R#^JJ{D)))7d7 $$^T:: !,"(**-@"A"A	 
  DO .X$/2PTX2X 	* - 	)&&(((((	) 	)s   [. .[=<[=rR   c                     dS )NTr1   r   s    r5   is_fastzTokenizersBackend.is_fast  s    tr7   c                     d| j         v r]| j         d                             d          r=t          | d          r+| j        r$t          j                            | j                  S dS dS )z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r%   rJ   FT)vocab_files_namesrs   rw   r%   re   rf   rg   r   s    r5   can_save_slow_tokenizerz)TokenizersBackend.can_save_slow_tokenizer  si     4111d6L\6Z6c6cdl6m6m1t\** 7t 7w~~do66654r7   save_directoryfilename_prefixc                    t           j                            |          s t                              d| d           d S t           j                            ||r|dz   ndt          d         z             }t           j                            | j                  t           j                            |          k    rt          | j        |           |fS )NzVocabulary path (z) should be a directory-r   r%   )
re   rf   isdirry   errorjoinVOCAB_FILES_NAMESabspathr%   r   )r   r   r   out_vocab_files       r5   save_vocabularyz!TokenizersBackend.save_vocabulary  s    w}}^,, 	LLT^TTTUUUFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNT_n555  r7   c                    | j         }| j        }|| j        rt          d          | j        }| j        }|| j        r	d| _        dS | j        r|dz   nd d| j        rd|z   dz   nd }| | j        rd|z   d	z   nd d
| j        rd|z   d	z   nd }g }| j        r|                    ||f           | j        r|                    ||f           t          j	        |||          | j
        _        dS )ze
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        Nz)add_bos_token = True but bos_token = NoneFz:0 r   z$A:0rC   z:0z:1z $B:1)singlepairspecial_tokens)r\   bos_token_idr   r   r]   eos_token_idr   rV   r	   TemplateProcessingr   rG   )r   bosr  eosr  r   r  r  s           r5   r   z'TokenizersBackend.update_post_processor  sk    n(;4-;HIIIn( ;4-;!&DF%)%7?S5[[Rww[_[mEucCiRVFVFVsuww  D0BJ39t++  D  Dgkgy  RBRUX[R[^bRbRb  @B  D  D 	7!!3"5666 	7!!3"5666)3)F^*
 *
 *
&&&r7   c                 $    t          | dd          S )Nr   Frr   r   s    r5   r   zTokenizersBackend.add_eos_token      t-u555r7   c                 $    t          | dd          S )Nr   Fr	  r   s    r5   r   zTokenizersBackend.add_bos_token  r
  r7   c                 f    t                               | d|           |                                  d S )Nr   object__setattr__r   r   values     r5   r   zTokenizersBackend.add_eos_token  3    4!15999""$$$$$r7   c                 f    t                               | d|           |                                  d S )Nr   r  r  s     r5   r   zTokenizersBackend.add_bos_token  r  r7   c                    g }| j                                         D ]j}|t          |t                    r|                    |           0t          |t
                    r%|                    t          |dd                     k| j        D ]g}t          |t                    r|                    |           -t          |t
                    r%|                    t          |dd                     h|r|                     |d           t          | dd          s| j	        j
        |                                  dS dS )a[  
        Post-initialization hook that runs after the tokenizer is fully set up.
        This is called by from_pretrained() after loading the tokenizer, which allows
        us to add any special tokens that may have been passed as AddedToken objects.

        Child classes should call super()._post_init() if they override this method.
        NTF)r   
normalized)r  r   )r   rQ   r?   r   rV   rD   r   r   rr   r   rG   r   )r   r   token_valuer;   s       r5   
_post_initzTokenizersBackend._post_init  sl    3::<< 	^ 	^K"+z22 ^$$[1111K-- ^$$ZTV[%\%\%\]]] / 	X 	XE%,, X$$U++++E3'' X$$ZtPU%V%V%VWWW 	@OOM$O???48$?? 	)4?CaCi&&((((( DjCir7   c                 8    | j                             d          S )zP
        `int`: Size of the base vocabulary (without the added tokens).
        Fwith_added_tokensr   r   r   s    r5   r   zTokenizersBackend.vocab_size  s    
 ---FFFr7   c                 8    | j                             d          S )NTr  )r   	get_vocabr   s    r5   r  zTokenizersBackend.get_vocab  s    ((4(@@@r7   c                 *    |                                  S rT   )r  r   s    r5   r/   zTokenizersBackend.vocab  s    ~~r7   c                 h    d t          | j                                        d           D             S )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                 $    i | ]\  }}|j         |S r1   contentr3   vks      r5   r<   z:TokenizersBackend.added_tokens_encoder.<locals>.<dictcomp>       mmmA	1mmmr7   c                     | d         S r   r1   r4   s    r5   r   z8TokenizersBackend.added_tokens_encoder.<locals>.<lambda>      dhijdk r7   r   r   r   r   r   s    r5   r   z&TokenizersBackend.added_tokens_encoder  s9     nm0I0O0O0Q0QWkWk)l)l)lmmmmr7   c                 4    | j                                         S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )r   get_added_tokens_decoderr   s    r5   r   z&TokenizersBackend.added_tokens_decoder
  s     77999r7   c                 h    d t          | j                                        d           D             S )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                 $    i | ]\  }}|j         |S r1   r!  r#  s      r5   r<   z5TokenizersBackend.get_added_vocab.<locals>.<dictcomp>   r&  r7   c                     | d         S r   r1   r(  s    r5   r   z3TokenizersBackend.get_added_vocab.<locals>.<lambda>   r)  r7   r   r*  r   s    r5   get_added_vocabz!TokenizersBackend.get_added_vocab  s9     nm0I0O0O0Q0QWkWk)l)l)lmmmmr7   c                     dS )zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        Tr1   r   s    r5   __bool__zTokenizersBackend.__bool__"  s	     tr7   c                 8    | j                             d          S )zD
        Size of the full vocabulary with the added tokens.
        Tr  r  r   s    r5   __len__zTokenizersBackend.__len__(  s     ---EEEr7   c                     | j         S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )r   r   s    r5   backend_tokenizerz#TokenizersBackend.backend_tokenizer.  s    
 r7   c                     | j         j        S )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )r   decoderr   s    r5   r8  zTokenizersBackend.decoder5  s    
 &&r7   Tr-   return_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                 F   |	d| j         v }|	d| j         v }|r|j        |g|j        z   }	n|g}	t          t                    }
|	D ]}|
d                             |j                   |r |
d                             |j                   |r |
d                             |j                   |r |
d                             |j                   |r |
d                             |j	                   |r-|
d                             t          |j                             |
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        Ntoken_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingr   )model_input_namesoverflowingr   r@   rV   idstype_idsrB  rD  offsetsr   )r   r-   r9  r:  r;  r<  r=  r>  r?  	encodingsencoding_dictr   s               r5   _convert_encodingz#TokenizersBackend._convert_encoding<  sV   ( !($48N$N! ($48N$N!$ 	#)=)I!
X%99II!
I#D)) 	; 	;A+&--ae444$ C./66qzBBB$ I./66q7GHHH) S34;;A<QRRR% B./66qyAAA ;h'..s15zz:::i''r7   r;   c                 L    | j                             |          }|| j        S |S rT   )r   token_to_idunk_token_id)r   r;   r   s      r5   #_convert_token_to_id_with_added_vocz5TokenizersBackend._convert_token_to_id_with_added_vock  s*    ++E22=$$r7   r   c                 P    | j                             t          |                    S rT   )r   id_to_tokenint)r   r   s     r5   _convert_id_to_tokenz&TokenizersBackend._convert_id_to_tokenq  s    **3u::666r7   
new_tokensc                 n    |r| j                             |          S | j                             |          S rT   )r   add_special_tokensr   )r   rV  r  s      r5   _add_tokenszTokenizersBackend._add_tokenst  s7     	B?55jAAA))*555r7   r  c                 6    | j                             |          S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )r   num_special_tokens_to_add)r   r  s     r5   r[  z+TokenizersBackend.num_special_tokens_to_addz  s    & 88>>>r7   rH  skip_special_tokensc                 <   t          |t                    r| j                            |          S g }|rt	          | j                  nt	                      }|D ]C}t          |          }||v r|                    | j                            |                     D|S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )r?   rT  r   rS  r{   all_special_idsrV   )r   rH  r\  r   ids_to_skipr   s         r5   convert_ids_to_tokensz'TokenizersBackend.convert_ids_to_tokens  s     c3 	4?..s3333FQc$.///CEE 	> 	>EJJE##MM$/55e<<====r7   textrX  c                 H     | j         d|||d|                                S )N)ra  	text_pairrX  r1   )_encode_plusr   )r   ra  r  rX  r   s        r5   tokenizezTokenizersBackend.tokenize  s2     t lddOallekllssuuur7   padding_strategyr   r   r   r   r   c                    | j         j        | j         j        }|t          j        k    r| j                                          n<|||j        | j        d}d}	nfd|D             }	|	|k    r | j         j        di | |t          j
        k    r|| j                                          dS dS |t          j        k    r|nd}
|
||n| j        | j        | j        | j        |d}||k    r | j         j        di | dS dS )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r   r   r   r   c                 >    i | ]}|                     |d           S rT   rn   )r3   r%  r   s     r5   r<   z@TokenizersBackend.set_truncation_and_padding.<locals>.<dictcomp>  s)    GGG11kooa66GGGr7   )r   r   pad_idrZ   r   r   r1   )r   r   r   r   DO_NOT_TRUNCATEr   r  r   r   r   
DO_NOT_PAD
no_padding
MAX_LENGTHr   pad_token_idrZ   r   r   )r   rf  r   r   r   r   r   r   targetcurrentr   r   s              @r5   set_truncation_and_paddingz,TokenizersBackend.set_truncation_and_padding  si   B o0?*"4"DDD&--/// ) /5!1	 F "GGGGGGG&  11;;F;;;999#**,,,,, $# $47Q#Q#QZZW[F -9-E\\4K\+!^#5&8 F 6!!..8888888 "!r7   r   rc  is_split_into_wordsreturn_tensorsr   c                    # d } ||          st          d          | ||          st          d          |rAt          |t          t          f          o#|o!t          |d         t          t          f          }nt          |t          t          f          }|rt          |t                    rt          d          |Pt          |          t          |          k    r0t          dt          |           dt          |           d          |t          t          ||                    n|}n
|r||fgn|g}t          |t          t          f          s t          dt          |           d	           	                    |||||	|

           | j
        } j        j        |k    r| j        _         j                            |||          } fd|D             }i }|d         d         D ]##fd|D             }||#<   d |D             }r;g }t          |          D ]$\  }\  }} ||gt          |d                   z  z  }%||d<   |d         D ]}!                     |!|           t!          |||          }"|s5|3s1t!          d |"                                D             |"j                  }"|"S )Nc                 r   t          | t                    rdS t          | t          t          f          rt	          |           dk    rdS t          | d         t                    rdS t          | d         t          t          f          rt	          | d                   dk    s!t          | d         d         t                    rdS t          | d         d         t          t          f          rFt	          | d         d                   dk    p&t          | d         d         d         t                    S dS dS dS )NTr   F)r?   rD   r@   r2   r   )r   s    r5   _is_valid_text_inputz<TokenizersBackend._encode_plus.<locals>._is_valid_text_input  s   !S!! tAe}-- q66Q;;4!c** 
!4!tUm44 !1Q4yyA~~AaDGS)A)A~#t#AaDGdE];; %"1Q47||q0OJqtAwqz34O4OO$u 5ur7   ztext input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs).r   zdwhen tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.zbatch length of `text`: z- does not match batch length of `text_pair`: .z:batch_text_or_text_pairs has to be a list or a tuple (got ))rf  r   r   r   r   r   )rX  is_pretokenizedc                 J    g | ]}                     |	            S ))r-   r9  r:  r;  r<  r=  r>  r?  )rM  )
r3   r-   r:  r>  r=  r;  r<  r9  r   r?  s
     r5   r6   z2TokenizersBackend._encode_plus.<locals>.<listcomp>d  sX      
  
  
  ""!&;&;*C+E'=+ # 	 	 
  
  
r7   c                 0    g | ]\  }}|         D ]}|S r1   r1   )r3   r4   r   r   r   s       r5   r6   z2TokenizersBackend._encode_plus.<locals>.<listcomp>u  s.    NNN74DINNqQNNNNr7   c                 "    g | ]\  }}|D ]}|S r1   r1   )r3   r   r4   r   s       r5   r6   z2TokenizersBackend._encode_plus.<locals>.<listcomp>w  s)    SSSWQdSSqSSSSr7   rC  overflow_to_sample_mapping)tensor_typec                     i | ]>\  }}|t          |          d k    r#t          |d          t                    r|d          n|?S r>   )r   r?   r@   )r3   r   r  s      r5   r<   z2TokenizersBackend._encode_plus.<locals>.<dictcomp>  sW       "U c%jj1nnE!Hd9S9Sn%((Y^  r7   )r   r?   r@   r2   rD   	TypeErrorr   ziptyperr  r   r   r   encode_batchrq   &_eventual_warn_about_too_long_sequencer   r   rK  )$r   ra  rc  rX  rf  r   r   r   rs  r   r   rt  r9  r:  r;  r<  r=  r>  r?  r   r   rw  
is_batchedbatch_text_or_text_pairsrK  tokens_and_encodingssanitized_tokensstacksanitized_encodingsr~  r:   toksr   rC  batched_outputr   s$   `           ```````                @r5   rd  zTokenizersBackend._encode_plus  s   0	 	 	( $#D)) 	W  
  )=)=i)H)H W    	9#D4-88hThjQUVWQX[_afZgFhFhJJ#D4-88J 	T)S))    $Tc)nn)D)D *s4yy * *I* * *   FOEZtCi,@,@'A'A'A`d$$ ?H'Sy(9':':dV$ 2UDMBB 	nTRjMkMknnn   	''- 3!1% 	( 	
 	
 	
  '#'#< ?04HHH4HDO1 O00$1/ 1 
 
	 
  
  
  
  
  
  
  
  
  
  
 & 
  
  
 '*1- 	* 	*CNNNN&:NNNE$)S!!SS0DSSS % 	X)+& )*> ? ? K K9D!*qcC[8I4J4J.JJ**=W9:)+6 	X 	XI77	:wWWWW&'79LZhiii  	n4=V4* &4&:&:&<&<   ( N r7   r   c                     | j         j        | j         j                            |          nd                    |          S )NrC   )r6  r8  decoder   )r   r   s     r5   convert_tokens_to_stringz*TokenizersBackend.convert_tokens_to_string  s@     %-9 "*11&999&!!	
r7   	token_idsclean_up_tokenization_spacesc                    |                     dd            t          |t                    r|g}t          |t                    r|d         }| j                            ||          }||n| j        }|rt          | d          r*t          | j	                  r| 	                    |          }n|
                    dd          
                    dd          
                    d	d
          
                    dd          
                    dd          
                    dd          
                    dd          
                    dd          
                    dd          
                    dd          }|S )Nuse_source_tokenizerrC  )r\  clean_up_tokenizationz .rx  z ??z !!z ,,z ' 'z n'tzn'tz 'mz'mz 'sz'sz 'vez'vez 'rez're)rd   r?   rT  rc   r   r  r  rw   callabler  replace)r   r  r\  r  r   ra  s         r5   _decodezTokenizersBackend._decode  sn    	

)4000i%% 	$"Ii&& 	/!+.I%%iEX%YY ,7 )(2 	%
 ( 	t455 (4C]:^:^ 11$77 LLs++WT3''WT3''WT3''WUC((WVU++WUD))WUD))WVU++WVU++  r7   
file_names.legacy_formatc                     t          |          }t          j                            ||r|dz   ndt          z             }| j                            |           ||fz   }|S )Nr   r   )rD   re   rf   r   TOKENIZER_FILEr6  save)r   r   r  r  r   r$   s         r5   _save_pretrainedz"TokenizersBackend._save_pretrained  sl     ^,,oM_s222Q__
 
 	##N333>"33
r7   c           	         t          j        | j                                                  }|                    d          }|                    d          }	d}
|d         d         dk    ri |d         d<   g |d         d<   n|d         d         d	k    r^|d         d
         O|d         d
         }|d         d         |         d         }
|
v r|
         }
d|d         d
<   |
dgg|d         d<   n;|d         d         dv ri |d         d<   nt          d|d         d          d          7d|d         v r-|d         d         v r|d         d                  |d         d<   t          j        t          j        |                    g }|D ]}|                    dd          }|                    dd          }|d         d         d	k    r|sC|d         v r|d                  |d<   |	                    t          d*i |           ||                    |           |d         d         dk    r#d|vr|d         d         |d         d         |d<   |d         d         dk    r#d|vr|d         d         |d         d         |d<   |d         d         d	k    r|
|
|d<   |d         t|d         d         dk    sA|d         d         dk    rPd|d         v rFt          d |d         d         D                       r!t          j                                        |d<   t           |d         d                  } |d*||d|}                    |||           |	,t          j                                                  }d|	v r|	d         D ]}|	d         |         d         }fd |D             }||	d         |         d<   |D ](}                    |          }|t          d!          )fd"|D             |	d         |         d#<   d$D ]L}||	v rF|	|         \  }}|v r|         }                    |          }|t          d!          ||g|	|<   M|	|d<   t          j        t          j        |                    | j                                        }t*          j        D ]}t/          | |          t/          | |          }|v r|         }| j                            |d          }t5          |t                    r-t          ||j        |j        |j        |j        d%&          ||<   |||<   | j        r| j                                        ng }||                    |           tA          |          dk    r||d'<   |d(<   	  | j!        d*i |S # tD          $ rH}d)tG          |          v r1|                    d(d            | j!        d*i |}|_        |cY d}~S  d}~ww xY w)+uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        added_tokensrG   Nr.   r  r   r/   rA   r   r   r   g        )r"   r#   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.r[   r   idr"  continuing_subword_prefixend_of_word_suffixr   	ByteLevelSequencepretokenizersc              3   .   K   | ]}|d          dk    V  dS )r  r  Nr1   )r3   pretokenizers     r5   	<genexpr>z<TokenizersBackend.train_new_from_iterator.<locals>.<genexpr>;  s@        $ !(K7     r7   initial_alphabet)r   r  )r   trainerr  r   c                 <    g | ]}                     ||          S r1   ri  )r3   r;   special_tokens_maps     r5   r6   z=TokenizersBackend.train_new_from_iterator.<locals>.<listcomp>M  s*    ![![![5"4"8"8"F"F![![![r7   zQAttempted to set a token in the post processor that does not exist in the mappingc                 :    g | ]}                     |          S r1   )rO  )r3   r;   r   s     r5   r6   z=TokenizersBackend.train_new_from_iterator.<locals>.<listcomp>V  s)    CuCuCuejIDYDYZ_D`D`CuCuCur7   rH  )r}   sepT)single_wordlstriprstripr  r   rP   r*   z7multiple values for keyword argument 'tokenizer_object'r1   )$rl   loadsr   to_strrd   r   ri   from_strdumpsrV   r   rU   anypre_tokenizers_fastr  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorrO  r   r   r   SPECIAL_TOKENS_ATTRIBUTESrr   r   rn   r?   r  r  r  r  rP   r   r   r  rD   )r   text_iteratorr   r   new_special_tokensr  r   r   r  rG   r[   r   r  added_tokenr   r   trainer_classr  trained_tokenizer_jsonr   r   r;   token_idspecial_tokenspecial_token_fullrP   r   new_tokenizerr   s        `                      @r5   train_new_from_iteratorz)TokenizersBackend.train_new_from_iterator  s   D DO$:$:$<$<==%)).99'++,<==	'"6*e33/1N7#G,02N7#H--G$V,	99g&x0<'0:*73G<VDQG	%1iCU6U6U 29 =I45w'15>4D3Ew'0G$V,0JJJ/1N7#G,,>n]dNeflNm > > >   *~g666w'48JJJ3EnU\F]^iFj3kN7#K0!*4:n+E+EFF	 ' 	= 	=K!ooi66Gd++Ag&v.);;G;!-+i2HL^2^2^);K	<R)SI&!!*";";{";";<<<<)!!"4555 7#F+u44+699w'(CDP2@2IJe2fF./7#F+u44$F22w'(<=I+9'+BCW+XF'('"6*i77I<Q"+F;/*6/7;FF!/26:jHH#~o'FFF  (6(G(X     G .A-J-S-S-U-U)*01H1PQ-_:n__X^__%%mFG%TTT%%)Z	0@0@0B0B%C%C">11)*:; v vC+,<=cB8LF)5![![![![TZ![![![FLN#34S9(C!'  #,#8#8#?#?#+", s# #  ,
 DvCuCuCuntCuCuCuN#34S9%@@!/ 
F 
F N22-m<HE1)5%CU:U:U 25 9(44U;;H'(o   6;H4EN=17E"#34%.tz:P/Q/QRRI!&&((,F 	2 	2EtU##/ 'e 4 4%1mGY6Y6Y$6}$EM%)%=%A%A%%N%N"0*== 2$.%$6$B1818#5#@ $% % %F5MM %2F5M DHC\dt8==???bd) ''(:;;;#$$q((-AF)* &/!"	!4>++F+++ 	 	 	HCPQFFRR 

-t444 . 8 8 8 8+4($$$$$$$ 	s$   !V. .
X 8<W;4X :W;;X c
           
      2   ddl ddlm ddlm} ddlm} dt          dt          ffd}t                      rd	}|V|s|sQ ||          rE ||d
|||dd|          }d}|t          |d          5 }t          j        |          }ddd           n# 1 swxY w Y   |                    d          }|                    d          }|r7|                    |          |                    d          k    r|r||dvr|S n0|r.|                    |          |                    d          k    r|S d	}|s|sW ||          rK|rd|v rt          |d|d                    |	At!          |dd          s0t          |dd           t"                              d| d           n|	d	u st!          |dd          rt          |dd	           ddl}|j                            |                    d          d          }|j        j        }t3          ||j        j                  r||j        j        d<   n\t3          ||j        j                  r|j                            dd          }|j                            ||g          |j        _        |S )af  
        Patches mistral related tokenizers with incorrect regex if detected
            1) Local file with an associated config saved next to it
                >> Model type one of the mistral models (on older versions)
            2) Remote models on the hub from official mistral models
                >> Tags including `base_model:.*mistralai`
        r   N)
model_info)versionr   model_idrR   c                      |           }|j         0                    dd                    |j                             rdS dS )Nzbase_model:.*mistralair   TF)tagssearchr   )r  r.   r  res     r5   is_base_mistralz?TokenizersBackend._patch_mistral_regex.<locals>.is_base_mistral  sI    Jx((Ez%995rwwuz7J7JKK  45r7   Tzconfig.jsonF)	cache_dirr;   local_files_only%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_commit_hashr+   r,   transformers_versionr   z4.57.2)mistralmistral3voxtral	ministralpixtralz4.57.3r   z$The tokenizer you are loading from 'a  ' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+isolated)patternbehavior)r   	use_regex)r  huggingface_hubr  	packagingr  transformers.utils.hubr   rD   boolr   rk   rl   rm   rn   parsesetattrrr   ry   rz   r   pre_tokenizersSplitRegexr6  r   r?   r  	Metaspacer  )r}   r   pretrained_model_name_or_pathr;   r  r  r  is_localr   r   r   r  r   r  _config_filemistral_config_detectedf_configr  transformers_model_typer   split_pretokenizercurrent_pretokenizerr  r  s                          @@r5   r   z&TokenizersBackend._patch_mistral_regex  s   * 				......%%%%%%666666	c 	d 	 	 	 	 	 	 	  	H(4 5% 5*9/:W*X*X 5 ';-#!16;8=)	 	 	L ',#',999 +Q"illG+ + + + + + + + + + + + + + +'.{{3I'J'J$*1++l*C*C'
 ( %GMM:N,O,OSZS`S`aiSjSj,j,j )3?3    )() %gmm<P.Q.QT[TaTabjTkTk.k.k$$*.'& *x *OOLi<j<j * ^#6+#E#EI':KH[<\]]] %,WYH[]b5c5c,I':EBBBNNe?\ e e e   
 '$..')EXZ_2`2`.I':DAAA%%%%)3)B)H)H * 0 0 s! ! ",	 *I * *& ,5+F+T(!"6
8Q8Z[[ GY	3A!DD &&:J<U<_`` 3=3L3V3V16% 4W 4 40
 EOD]DfDf 2 4E E	3A s   :BB"B)FrT   )NNFFFFT)NF)FN)NN)NNN)NNFNFNN)Crp   
__module____qualname____doc__r   r   r.   r   classmethodr   r)   propertyr  r   r   rD   r2   r   r   r   r   setterr  rT  r   rc   r  r/   r   r   r   _added_tokens_encoder_added_tokens_decoderr0  r2  r4  ri   r6  DecoderFastr8  EncodingFastr   r@   rM  rQ  rU  rY  r[  r`  re  r   r   rr  rl  rk  r   r   r   rd  r  r  re   PathLiker  r  r   __classcell__)r   s   @r5   r'   r'   R   sO       
 
 *EJC C C [CJZ) Z) Z) Z) Z)x     X     X! !c !C$J !Z_`cZd ! ! ! !
 
 
: 6 6 X6 6 6 X6 % % % % % %) ) )@ GC G G G XGA4S> A A A A  tCH~       X  nd38n n n n Xn :d3
?&; : : : X: 10nc3h n n n n$    F F F F F =    X ' ' ' ' X' .2-1*/+0',#-( -(-(  $d{-(  $d{	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-( -( -( -(^     7# 7#* 7 7 7 76 6d3+;&< 6WZ 6 6 6 6? ?d ?s ? ? ? ?* tCy t `cfjknfo`o    4v vS vd
 vt vjnorjs v v v vI9)I9 0I9 	I9
 I9  $JI9 DjI9 I9 I9 I9\ gk#',;,F2D2T!%$))-#'&*-1-1*/+0',#,0)X X++d9o=EV@WWX 004	?BTJ[E\\_ccX !	X
 *X 0X $JX X "X  $JX DjX tX  $d{X  $d{X $(X  %)!X" !%#X$ %X& 'X( #Tk)X, 
-X X X Xt
tCy 
S 
 
 
 
 %*48	( (c?( "( '+Tk	( 
( ( ( (\ &*&* bk) #s(O d{	
 t 
sCx   * C C C CJ 
 } } } [} } } } }r7   r'   )Ar  r   rl   re   collectionsr   collections.abcr   shutilr   typingr   tokenizers.pre_tokenizersr  r  r  r   r   r   r	   r
   r  r   ri   tokenizers.decodersr   r  tokenizers.modelsr   r   tokenizers.trainersr   r   r   r   r  r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utils_baser   r   r   r   r   r   r   utilsr   r    r!   
get_loggerrp   ry   r  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEADDED_TOKENS_FILEr  r   r'   PreTrainedTokenizerFastr1   r7   r5   <module>r     s   
   				 # # # # # # $ $ $ $ $ $             7 7 7 7 7 7 + + + + + + - - - - - - - - / / / / / / 1 1 1 1 1 1 6 6 6 6 6 6 * * * * * * * * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ . . . . . . 5 5 5 5 5 5 = = = = = =                  @ ? ? ? ? ? ? ? ? ? 
	H	%	% "3 / '  (      !!	   (6EXYY  ,--C C C C C/ C C .-CN& ,   r7   