
    bij                     l    d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ  G d d          ZdefdZd	S )
    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)bytes_to_unicode)PreTrainedTokenizerFastc                   @    e Zd ZdZ	 	 	 	 ddZdefdZd Zd	efd
Z	dS )MistralConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 >    || _         || _        || _        || _        d S )N)vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   kwargss         \/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/mistral.py__init__zMistralConverter.__init__   s(     
 0)B&&&    r   c                 H  
 |
t                      fdg }i }t          
                                          D ]\  }\  }}|| j        vr|| |          <   t	          |          dk    r3g }t          dt	          |                    D ]=}|d |         ||d          }	}|
v r#|	
v r||	z   
v r|                    ||	|f           >t          |
fdd          }|                    |           |||<   t          |d d          }fd|D             }||fS )Nc                 l    d                     fd|                     d          D                       S )N c                 :    g | ]}t          |                   S  )ord).0charbyte_encoders     r   
<listcomp>zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>   s$    TTTLT3TTTr   zlatin-1)joindecode)br    s    r   token_bytes_to_stringzOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   s6    77TTTT@S@STTTUUUr      c                 <    | d                  | d                  fS )Nr   r&   r   )x	bpe_rankss    r   <lambda>zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>-   s    Yqt_iPQRSPTo4V r   F)keyreversec                     | d         S )N   r   )vals    r   r*   zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>1   s
    A r   c                 T    g | ]$} |d                     |d                   f%S )r   r&   r   )r   r/   r%   s     r   r!   zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>2   s?    cccUX((Q002G2GA2O2OPcccr   )	r	   	enumerateitemsr   lenrangeappendsortedextend)r   r   mergesidxtokenranklocalindexpiece_lpiece_rr)   r    r%   s             @@@r   extract_vocab_merges_from_modelz0MistralConverter.extract_vocab_merges_from_model   s   	'))	V 	V 	V 	V 	V "+IOO,=,=">"> 	# 	#C%D:::69++E223u::??"1c%jj11 ? ?E',VeV}eEFFmWG)++90D0D'T[J[`iIiIigw%=>>>u*V*V*V*V`efffe$$$$"e$6$6FFFcccc\bcccf}r   c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S )NF)fuse_unkignore_mergesT)r@   r   r   r   hasattrmodelrC   )r   vocab_scoresr8   	tokenizers       r   rG   zMistralConverter.tokenizer5   s\    #CCDJOOfc,GGGHH	9?O44 	1,0IO)r   returnc                 |   |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        |                    | j                   t          j        d          |_        |S )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rG   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rG   s     r   	convertedzMistralConverter.converted<   s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	$$T%CDDD#-#7U#K#K#K	 r   )Nr   FN)
__name__
__module____qualname____doc__r   strr@   rG   r   rV   r   r   r   r   r      s           K"&C C C CS    6  9      r   r   tokenizer_filec                    ddl m} ddlm} |                    |           }|j        j        j        }t          |j        j        j	        d           }d |D             }d t          |          D             }|                    |           |}|j        j        j        j        }t          t          |||                                          	          }	|	                    d
|i           |j        j        |j        j        |j        j        |j        j        d}
|
                                D ] \  }}||v r|	                    ||i           !|	S )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )SpecialTokens)MistralTokenizerc                     | d         S )Nr;   r   )r(   s    r   r*   z*convert_tekken_tokenizer.<locals>.<lambda>X   s    mnoumv r   )r+   c                     g | ]
}|d          S )	token_strr   )r   r:   s     r   r!   z,convert_tekken_tokenizer.<locals>.<listcomp>Y   s    AAA%5%AAAr   c                     i | ]\  }}||	S r   r   )r   r9   r:   s      r   
<dictcomp>z,convert_tekken_tokenizer.<locals>.<dictcomp>[   s    KKKjc5ucKKKr   )r   r   r   )tokenizer_objectr   )	bos_token	eos_token	pad_token	unk_token)%mistral_common.tokens.tokenizers.baser^   (mistral_common.tokens.tokenizers.mistralr_   	from_fileinstruct_tokenizerrG   _tekken_token2id_nospecialr6   _all_special_tokensr1   update_model_pat_strr
   r   rV   rT   bosvalueeospadunkr2   )r\   r^   r_   mistral_tokenizerr   sorted_tokensall_specialspecials_tokensr   rG   
MAP_SPECALspecial_keyspecial_tokens                r   convert_tekken_tokenizerr   L   s    DCCCCCIIIIII )22>BB 0:UE,?I]cvcvwwwMAA=AAAKKKIk4J4JKKKO5!!!E  2<CLG ();
 
 

)++  I   "={!KLLL #&,"&,"&,"&,	 J '1&6&6&8&8 G G"]K''((+})EFFFr   N)
tokenizersr   r   r   r   r   tokenizers.modelsr   #transformers.convert_slow_tokenizerr	   *transformers.tokenization_utils_tokenizersr
   r   r[   r   r   r   r   <module>r      s    M M M M M M M M M M M M M M ! ! ! ! ! ! @ @ @ @ @ @ N N N N N NA A A A A A A AH-S - - - - - -r   