
    bi                         d dl Z d dlZd dlZddlmZ ddlmZ ddlmZ  ej	        e
          Z G d d          Z G d d	e          ZdS )
    N   )TrainerCallback)PREFIX_CHECKPOINT_DIR)loggingc                   4    e Zd Zd	defdZd Zd Zd Zd ZdS )
CheckpointManager   	kill_waitc                 >    || _         d| _        d| _        || _        dS )aD  
        Initialize the CheckpointManager for Just-In-Time checkpoint handling.

        Args:
            trainer: The Trainer instance that will be used to save checkpoints when SIGTERM is received.
            kill_wait (`int`, *optional*, defaults to 3): Grace period to distinguish between SIGTERM and SIGKILL.
        FN)traineris_checkpoint_requested_original_sigterm_handlerr
   )selfr   r
   s      ^/root/projects/butler/venv/lib/python3.11/site-packages/transformers/trainer_jit_checkpoint.py__init__zCheckpointManager.__init__   s&     ',$)-&"    c                     t          j         t           j        | j                  | _        t                              d           d S )Nz4JIT checkpoint signal handler registered for SIGTERM)signalSIGTERM_sigterm_handlerr   loggerinfor   s    r   setup_signal_handlerz&CheckpointManager.setup_signal_handler   s4    )/v~tG\)])]&JKKKKKr   c                     | j         rd S t                              d| j         d           t	          j        | j        | j                                                   d S )Nz4SIGTERM received, will request JIT checkpoint after s)r   r   r   r
   	threadingTimer_enable_checkpointstart)r   signumframes      r   r   z"CheckpointManager._sigterm_handler   s[    ' 	F\4>\\\]]](?@@FFHHHHHr   c                 H    t                               d           d| _        d S )Nz/Kill wait period elapsed, requesting checkpointT)r   r   r   r   s    r   r   z$CheckpointManager._enable_checkpoint&   s#    EFFF'+$$$r   c                    	 d| _         t                              d           | j        j        j        }t                              d|            | j                            d           }t           d| }t          j	        
                    ||          }t          j        |d           t          j	        
                    ||d          }t          |d	          5 }|                    d
| d           d d d            n# 1 swxY w Y   t                              d|            | j                            | j        j        d            t          j	                            |          r.t          j        |           t                              d           t                              d           d S # t$          $ r#}t                              d|             d }~ww xY w)NFzStarting JIT checkpointing...zSaving JIT checkpoint at step )trial-T)exist_okzcheckpoint-is-incomplete.txtwzCheckpoint started at step z and in progress...z2Created checkpoint progress sentinel marker file: zSentinel marker file removedz/Immediate JIT checkpoint completed successfullyzFailed to save JIT checkpoint: )r   r   r   r   stateglobal_step_get_output_dirr   ospathjoinmakedirsopenwrite_save_checkpointmodelexistsremove	Exceptionerror)r   current_step
output_dircheckpoint_foldercheckpoint_pathsentinel_filefes           r   execute_jit_checkpointz(CheckpointManager.execute_jit_checkpoint*   s   !	+0D(KK7888<-9LKKGGGHHH55D5AAJ#8 I I< I I gll:7HIIO K$7777 GLL5FHfggMmS)) YQWlWWWXXXY Y Y Y Y Y Y Y Y Y Y Y Y Y YKK\]\\]]] L))$,*<D)III w~~m,, <	-(((:;;;KKIJJJJJ 	 	 	LL>1>>???	s=   CF< D9F< D		F< D	B-F< <
G)G$$G)N)r	   )	__name__
__module____qualname__intr   r   r   r   r?    r   r   r   r      sv        # #3 # # # #L L LI I I, , ," " " " "r   r   c                   <    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
d	S )
JITCheckpointCallbackaN  
    Callback for Just-In-Time checkpointing on SIGTERM signals.

    When SIGTERM is received, the checkpoint manager sets `is_checkpoint_requested=True`.
    The callbacks detect this flag and set `control.should_training_stop=True`, which signals
    the Trainer's training loop to exit gracefully after saving the checkpoint.
    c                 "    d | _         d | _        d S )N)r   jit_managerr   s    r   r   zJITCheckpointCallback.__init__X   s    59r   c                     || _         |j        j        rJt          |          | _        | j                                         t                              d           d S d S )N)r   zJIT checkpointing enabled)r   argsenable_jit_checkpointr   rH   r   r   r   )r   r   s     r   set_trainerz!JITCheckpointCallback.set_trainer\   sa    <- 	50AAAD11333KK344444	5 	5r   c                 t    | j         r.| j         j        r$d|_        | j                                          d S d S d S NTrH   r   should_training_stopr?   r   rJ   r)   controlkwargss        r   on_pre_optimizer_stepz+JITCheckpointCallback.on_pre_optimizer_stepc   Q     	6 0 H 	6+/G(3355555	6 	6 	6 	6r   c                 t    | j         r.| j         j        r$d|_        | j                                          d S d S d S rN   rO   rQ   s        r   on_step_beginz#JITCheckpointCallback.on_step_beginh   rU   r   c                     | j         r5| j         j        r+d|_        d|_        | j                                          d S d S d S NFTrH   r   should_saverP   r?   rQ   s        r   on_step_endz!JITCheckpointCallback.on_step_endm   Y     	6 0 H 	6"'G+/G(3355555	6 	6 	6 	6r   c                     | j         r5| j         j        r+d|_        d|_        | j                                          d S d S d S rY   rZ   rQ   s        r   on_epoch_endz"JITCheckpointCallback.on_epoch_ends   r]   r   c                     | j         rQ| j         j        Gt          j        t          j        | j         j                   t                              d           d S d S d S )Nz;Restored original SIGTERM handler after training completion)rH   r   r   r   r   r   rQ   s        r   on_train_endz"JITCheckpointCallback.on_train_endy   s^     	W 0 J VM&.$*:*TUUUKKUVVVVV	W 	W V Vr   N)r@   rA   rB   __doc__r   rL   rT   rW   r\   r_   ra   rD   r   r   rF   rF   O   s         : : :5 5 56 6 6
6 6 6
6 6 66 6 6W W W W Wr   rF   )r,   r   r   trainer_callbackr   trainer_utilsr   utilsr   
get_loggerr@   r   r   rF   rD   r   r   <module>rg      s    				      - - - - - - 0 0 0 0 0 0       
	H	%	%? ? ? ? ? ? ? ?D.W .W .W .W .WO .W .W .W .W .Wr   