
    bi#                     n   d dl Z d dlZd dlmZmZ d dlmZ d dlZd dlm	Z	 d dl
mZ ddlmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZ  ej        e          Z e ej                              Z ed eD                       Ze G d d                      Z  G d de          Z! G d de          Z"dS )    N)	dataclassfield)Enum)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   $   K   | ]}|j         V  d S N)
model_type).0confs     [/root/projects/butler/venv/lib/python3.11/site-packages/transformers/data/datasets/squad.py	<genexpr>r   !   s$      EEDOEEEEEE    c                       e Zd ZU dZ edddd                    e          z   i          Zee	d<    edddi          Z
ee	d	<    ed
ddi          Zee	d<    ed
ddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    edddi          Zee	d<    eddd i          Zee	d!<    ed"dd#i          Zee	d$<   dS )%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r   r!   intr"   r$   r&   r'   boolr(   r)   floatr+   r,   r.    r   r   r   r   $   s          e(KdiiXcNdNd(de  J    E(pq  Hc     %Q
  NC    ers  J    "E/
  c    #UJ
  s    "E)\ ]  OT    %*E)o p% % %T    (-uv'rs( ( (u    uf&qr  K    5C
  GS    5f6k-lmmmGSmmmmmr   r   c                       e Zd ZdZdZdS )SplittraindevN)r/   r0   r1   r=   r>   r:   r   r   r<   r<   g   s        E
CCCr   r<   c                       e Zd ZU eed<   ee         ed<   eed<   eed<   dej	        dddfdede
d	edz  deez  ded
edz  defdZd Zdeeej        f         fdZdS )SquadDatasetargsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                 X   || _         || _        |j        rt                      nt	                      | _        t          |t                    r,	 t          |         }n# t          $ r t          d          w xY w|| _
        |j        rdnd}t          j                            ||n|j        d|j         d|j        j         d|j         d|           }	|	dz   }
t'          |
          5  t          j                            |	          r|j        st-          j                    }t/                       t1          j        |	d          | _        | j        d	         | _        | j                            d
d           | _        | j                            dd           | _        t>                               d|	 dt-          j                    |z
             | j        | j        t>          !                    d|	 d           n|t          j"        k    r%| j        #                    |j                  | _        n$| j        $                    |j                  | _        tK          | j        ||j        |j&        |j'        |t          j(        k    |j)        |          \  | _        | _        t-          j                    }t1          j*        | j        | j        | j        d|	           t>                               d|	 dt-          j                    |z
  dd           d d d            d S # 1 swxY w Y   d S )Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrB   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rQ   rF   r!   r"   r$   is_trainingr.   return_dataset)rB   rP   rQ   z!Saving features into cached file z [took z.3fz s])+rA   rD   r(   r   r   	processor
isinstancer5   r<   KeyErrorrC   ospathr3   r   value	__class__r/   r!   r   existsr'   timer   torchloadold_featuresrB   getrP   rQ   loggerinfowarningr>   get_dev_examplesget_train_examplesr   r"   r$   r=   r.   save)selfrA   rF   rG   rC   rD   rH   rI   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__r   s    	%:"/3/Kc)+++QaQcQcdC   	AAT{ A A A?@@@A	":Ddd!w||".IIDMedjee9#6#?ee$BUeeXcee 
  
 )72	i   -	 -	w~~233 ,D<P ,	(***$)J/CRV$W$W$W! !% 1* =#044YEE $ 1 5 5j$ G G]9M]]]_c_h_j_jmr_r   <'4=+@NN&0D & & &  
 59$$$(N$C$CDM$R$RDMM$(N$E$Edm$T$TDM.P!]'#'#6#%)%: $ 3 L#1	/ 	/ 	/+t| 	
!%4<UYUbcc(  
 q8LqqUYU^U`U`chUhqqqq  W-	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	 -	s   A A5'H+LL#&L#c                 *    t          | j                  S r   )lenrB   )rg   s    r   __len__zSquadDataset.__len__   s    4=!!!r   returnc                 &   | j         |         }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j                  }t          j        |j        t          j	                  }t          j        |j
        t          j	                  }|||d}	| j        j        dv r|	d= | j        j        dv r|	                    ||d           | j        j        r|	                    d|i           | j        rG|	                    dt          j        |j        t          j                  | j        j        z  i           | j        t*          j        k    rbt          j        |j        t          j                  }
t          j        |j        t          j                  }|	                    |
|d	           |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertru   )xlnetrv   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rB   r]   tensorrs   longrt   ru   r{   r|   r9   r}   rA   r   updater(   rD   onesshapeint64r,   rC   r<   r=   start_positionend_position)rg   ifeaturers   rt   ru   r{   r|   r}   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   -"L!2%*EEE	g&<EJOOOg&<EJOOOL!2%*EEE	gnEK@@@W%:%+NNN #,,
 
 9#PPP'(9#333MM	VDDEEEy0 @>???) owIO5;)W)W)WZ^ZcZk)kmnnn9###l7+ATTTO!L)=UZPPPMMMoP]^^___r   )r/   r0   r1   r   r6   listr   r<   r8   r=   r
   r7   r5   rl   ro   dictr]   Tensorr   r:   r   r   r@   r@   l   s        
$$$$=!!!!
KKK $(!K&+ $"J J(J 'J Dj	J
 EkJ  $J :J J J J JX" " " S%,%6 7            r   r@   )#rW   r\   dataclassesr   r   enumr   r]   filelockr   torch.utils.datar   models.auto.modeling_autor	   tokenization_pythonr
   utilsr   r   processors.squadr   r   r   r   
get_loggerr/   ra   r   keysMODEL_CONFIG_CLASSEStupler4   r   r<   r@   r:   r   r   <module>r      s   
			  ( ( ( ( ( ( ( (              $ $ $ $ $ $ M M M M M M 6 6 6 6 6 6 6 6 6 6 6 6 6 6 t t t t t t t t t t t t 
	H	%	%tE@EGGHH eEE0DEEEEE ?n ?n ?n ?n ?n ?n ?n ?nD    D   
u u u u u7 u u u u ur   