
    bi*Z                    "   U d Z ddlmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZmZ ddlZddlmZ ddlmZ d	d
lmZ d	dlmZ d	dlmZ d	dlmZmZ d	dlmZmZmZmZmZm Z m!Z!m"Z"m#Z# erd	dl$m%Z% d	dlm&Z&  ej'        e(          Z)e	 G d d                      Z*d>dZ+ee*ge,ee-e.ef         f         f         Z/d?dZ0	 d@dAd)Z1dBd+Z2dBd,Z3dBd-Z4dBd.Z5dBd/Z6dBd0Z7dBd1Z8dBd2Z9dBd3Z:dBd4Z;dBd5Z<dBd6Z=dBd7Z>dBd8Z?dBd9Z@dBd:ZAdBd;ZBejC        ejD        ejE        ejF        ejG        ejH        ejI        ejJ        ejK        ejL        ejM        ejN        ejO        ejP        ejQ        gZRejS        ejT        ejU        ejV        ejW        ejX        gZYejZ        ej[        gZ\ej]        ej^        gZ_ej`        eja        ejb        gZcejd        e2eje        e3ejf        e3ejg        e4ejh        e5eji        e6ejj        e8ejk        e9ejl        e:ejm        e;ejn        e?ejo        eBejp        e>ejq        e>ie-r                    eRe7          e-r                    eYe<          e-r                    e\e=          e-r                    e_e@          e-r                    eceA          Zsd<etd=<   dS )Cz,
Optimizer utilities for the Trainer class.
    )annotationsN)Callable)	dataclass)TYPE_CHECKINGAny)version)nn   )	Adafactor)LayerWiseDummyOptimizer)check_target_module_exists)OptimizerNamesParallelMode)	is_apollo_torch_availableis_bitsandbytes_availableis_galore_torch_availableis_grokadamw_availableis_lomo_availableis_schedulefree_availableis_torch_optimi_availableis_torchao_available	strtobool)PreTrainedModel)TrainingArgumentsc                  F    e Zd ZU dZded<   ded<   ded<   ded<   d	ed
<   dS )OptimizerContextz0Context object passed to all optimizer handlers.r   argszPreTrainedModel | Nonemodeldict[str, Any]optimizer_kwargsadam_kwargsdict[str, str]
optim_argsN)__name__
__module____qualname____doc____annotations__     Y/root/projects/butler/venv/lib/python3.11/site-packages/transformers/trainer_optimizer.pyr   r   6   sT         ::!!!!$$$$r*   r   optim_args_str
str | Nonereturnr"   c                    | si S i }|                      dd                              d          D ]}|                    d          \  }}|||<    |S )z8Parse optimizer arguments from a comma-separated string.  ,=)replacesplit)r,   r#   mappingkeyvalues        r+   _parse_optim_argsr9   A   sg     	J!))#r2288==    ]]3''
U
3r*   optimizer_cls_or_factoryr   boolc                r    t          | t                    r!t          | t          j        j                  rdS dS )a4  
    Check if the returned value from a handler is a factory rather than an Optimizer class.

    Factory callables are used for complex optimizers like Muon or Dion that need to:
    - Split parameters between multiple internal optimizers
    - Handle complex sharding logic
    - Access the full model structure for parameter grouping

    Args:
        optimizer_cls_or_factory: The first element returned by an optimizer handler.

    Returns:
        `bool`: True if it's not an Optimizer class (i.e., likely a factory), False if it's an Optimizer class.
    FT)
isinstancetype
issubclasstorchoptim	Optimizer)r:   s    r+   is_optimizer_factoryrC   P   s:      *D11 jAY[`[f[p6q6q u4r*   Tr   r   r   r   optimizer_namestroptimizer_mappingr   optim_kwargsr    is_layerwise_supportedtuple[Any, dict[str, Any]]c                   |                                                     d          }|r*| j        t          j        k    r|rt          d| d          ||         }| j        t          d| d          t          | j        t          t          f          st          d| j                   |t          d| d	          t          | j        t                    o| j                            d
d          dk    }	g |                                D ]{\  }
}t          | j        |
d          \  }}t          |t          j                  s%|r"|s t"                              |
 d| d           ^|s|	sc                    |
dz              |t)                    dk    rt          d| d| j         d          fd|                                D             }fd|                                D             }d|id|i|g}|r| j        dk    rt          d| d          i |D ]} |d|gigfi ||<   |D ]} |d|gi|gfi ||<   fd}|                                D ]}|j        r|                    |           t4          }|                    di           |                    d|i           ||fS )z
    Helper function to set up low-rank optimizers like GaLore and Apollo.

    These optimizers apply low-rank projections to specific target modules (typically linear layers).
    	layerwisezLayer-wise z" does not support DDP at this timeNz1You need to define `optim_target_modules` to use z optimizerszX`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: z'You need to pass a model to initialize z optimizer._-z
all-linearT)return_is_regexz matched but ignored. z only supports linear layers.z.weightr   zNo target modules found for z (z).c                "    g | ]\  }}|v 	|S r)   r)   .0nptarget_params_namess      r+   
<listcomp>z-_setup_low_rank_optimizer.<locals>.<listcomp>   s(    XXX41aqDW?W?WQ?W?W?Wr*   c                "    g | ]\  }}|v	|S r)   r)   rP   s      r+   rU   z-_setup_low_rank_optimizer.<locals>.<listcomp>   s(    ```tq!1L_C_C_C_C_C_r*   paramsr
   z
Layerwise z( does not support gradient accumulation!c                    | j         6|                                           |                                           d S d S )N)gradstep	zero_grad)paramoptimizer_dicts    r+   optimizer_hookz1_setup_low_rank_optimizer.<locals>.optimizer_hook   sG    z%u%**,,,u%//11111 &%r*   r]   )lowerendswithparallel_moder   DISTRIBUTEDNotImplementedErroroptim_target_modules
ValueErrorr=   listrE   	TypeErrorr4   named_modulesr   r	   Linearloggerwarningappendlennamed_parametersgradient_accumulation_steps
parametersrequires_grad"register_post_accumulate_grad_hookr   update)r   r   rD   rF   rG   r    rH   is_layerwiseoptimizer_cls
all_linearmodule_namemoduletarget_module_existsis_regextarget_paramsnon_target_paramsparam_groupsr\   r^   r]   rT   s                      @@r+   _setup_low_rank_optimizerr~   e   s    "''))22;??L d*l.FFFKaF!"b"b"b"bccc%n5M (h^hhhiiid/$== 
0-0 0
 
 	

 }^>^^^___ 	4,c22rt7P7X7XY\^a7b7bfr7r  $2244 < <V)C%{D*
 *
 *
&h &"),, 	# tH t+rr^rrrsss# 	J 	"";#:;;;;
1$$ggg$JcggghhhXXXX5#9#9#;#;XXXM````u'='='?'?``` 
$%	=1L1L
  D+q00b.bbbccc& 	] 	]E$1MHug3F2G$\$\K[$\$\N5!!" 	m 	mE$1MHug3V3V2W$l$l[k$l$lN5!!	2 	2 	2 	2 	2
 %%'' 	I 	IE" I88HHH/!1> BCCCX|4555***r*   ctxc                X    | j                             ddd           t          | j         fS )zGet Adafactor optimizer.Fscale_parameterrelative_step)r    rs   r   r   s    r+   _get_adafactorr      s/    EE R RSSSc***r*   c                    ddl m} | j                            | j                   | j        j        t          j        k    r| j                            ddi           || j        fS )z/Get PyTorch AdamW optimizer (regular or fused).r   AdamWfusedT)	torch.optimr   r    rs   r!   r   rA   r   ADAMW_TORCH_FUSEDr   r   s     r+   _get_adamw_torchr      sh    !!!!!!000
x~999##WdO444#&&&r*   c                    	 ddl m} | j                            | j                   || j        fS # t
          $ r t          d          w xY w)z'Get Torch XLA syncfree AdamW optimizer.r   r   z7Trainer failed to import syncfree AdamW from torch_xla.)torch_xla.amp.syncfreer   r    rs   r!   ImportErrorre   r   s     r+   _get_adamw_torch_xlar      so    T000000##CO444c*** T T TRSSST	   -0 A
c                    	 ddl m} | j                            | j                   || j        fS # t
          $ r t          d          w xY w)zGet NPU Fused AdamW optimizer.r   )NpuFusedAdamWz3Trainer failed to import FusedAdamW from torch_npu.)torch_npu.optimr   r    rs   r!   r   re   )r   r   s     r+   _get_adamw_torch_npu_fusedr      so    P111111##CO444c222 P P PNOOOPr   c                    	 ddl m} | j                            | j                   || j        fS # t
          $ r t          d          w xY w)zGet Apex Fused Adam optimizer.r   )	FusedAdamzFTrainer tried to instantiate apex FusedAdam but apex is not installed!)apex.optimizersr   r    rs   r!   r   re   )r   r   s     r+   _get_adamw_apex_fusedr      so    c------##CO444#... c c cabbbcr   c                   t                      st          d          ddlm}m}m} | j        j        }d|v }d|v rdnd}d}| j        }d	|v r|}nod
|v r|}d| j        j	        | j        j
        fi}nMd|v r|}| j        }n>d|v r9ddlm}	 |	}t          | j                            d| j        j	                            t          | j                            d| j        j
                            t          | j                            dd                    ft          | j                            dd                    t          | j                            d| j        j                            d}d| j        v rt!          | j        d                   |d<   d| j        v rt!          | j        d                   |d<   d|i}
d|vr||
d<   | j                            |           | j                            |
           || j        fS )z;Get bitsandbytes optimizer (AdamW, Lion, RMSprop variants).ziYou need to install `bitsandbytes` in order to use bitsandbytes optimizers: `pip install -U bitsandbytes`r   )r   LionRMSproppaged8bit       Nadamlionbetasrmspropademamix)AdEMAMixbeta1beta2beta3gH.?alphag      @eps)r   r   r   t_alphat_beta3
optim_bitsis_paged)r   r   bitsandbytes.optimr   r   r   r   rA   r!   
adam_beta1
adam_beta2r#   r   floatgetadam_epsilonintr    rs   )r   r   r   r   
optim_namer   r   ru   additional_optim_kwargsr   
bnb_kwargss              r+   _get_bitsandbytes_optimizerr      sV   $&& 
w
 
 	
 8777777777J*$H
**JM!o	:		#*SX-@#(BU,V"W	j	 	 "%.	z	!	!//////  cn((#(2EFFGGcn((#(2EFFGGcn((&99::
 3>--gs;;<<++E383HIIJJ#
 #
 &&14S^I5N1O1O#I.&&14S^I5N1O1O#I.
+J
""!)
: 7888
+++#...r*   c                ,   	 ddl m} | j                            | j                   | j                            t          | j                            dd                    t          t          | j                            dd                    t          t          | j                            dd                    t          t          | j                            dd	                    d
           || j        fS # t          $ r t          d          w xY w)z!Get AnyPrecision AdamW optimizer.r   )AnyPrecisionAdamWuse_kahan_summationFalsemomentum_dtypefloat32variance_dtypecompensation_buffer_dtypebfloat16)r   r   r   r   z4Please install https://github.com/pytorch/torchdistx)torchdistx.optimizersr   r    rs   r!   r   r#   r   getattrr@   r   re   )r   r   s     r+   _get_adamw_anyprecisionr   &  s   Q;;;;;;##CO444##'01C1CDY[b1c1c'd'd")%1C1CDTV_1`1`"a"a")%1C1CDTV_1`1`"a"a-43>--.I:VV. .	 		
 		
 		
 !#"666 Q Q QOPPPQs   C6C9 9Dc                2    t           j        j        | j        fS )zGet SGD optimizer.)r@   rA   SGDr    r   s    r+   _get_sgdr   ;  s    ;?C000r*   c                2    t           j        j        | j        fS )zGet Adagrad optimizer.)r@   rA   Adagradr    r   s    r+   _get_adagradr   @      ; 444r*   c                2    t           j        j        | j        fS )zGet RMSprop optimizer.)r@   rA   r   r    r   s    r+   _get_rmspropr   E  r   r*   c                   t                      st          d          ddlm}m}m} t          j        |t          j        |t          j	        |t          j
        |t          j        |t          j        |i}t          | j                            dd                    t          | j                            dd                    t!          | j                            dd	                    | j                            d
d          d}t#          | j        | j        | j        j        ||| j                  \  }}| j        j        t          j	        k    r|                    ddd           ||fS )zGet GaLore optimizer.zYou need to install `galore_torch` in order to use GaLore optimizers. Install it with `pip install git+https://github.com/jiaweizzhao/GaLore`r   )GaLoreAdafactorGaLoreAdamWGaLoreAdamW8bitrank   update_proj_gap   scaleg      ?	proj_typestd)r   r   r   r   Fr   )r   r   galore_torchr   r   r   r   GALORE_ADAMWGALORE_ADAMW_8BITGALORE_ADAFACTORGALORE_ADAMW_LAYERWISEGALORE_ADAMW_8BIT_LAYERWISEGALORE_ADAFACTOR_LAYERWISEr   r#   popr   r~   r   r   rA   r    rs   )r   r   r   r   rF   galore_optim_kwargsru   r    s           r+   _get_galore_optimizerr   J  sn   $&& 
V
 
 	
 KJJJJJJJJJ 	#[(/'-{2O1? CN&&vs3344s~112CSIIJJs~))'48899^''U;;	  '@#)SX^->@SUXUi' '#M# x~888EE R RSSS***r*   c           
        t                      st          d          ddlm} t          j        |t          j        |i}t          | j        	                    dd                    | j        	                    dd          | j        	                    dd	          t          | j        	                    d
d                    t          | j        	                    dd                    | j        	                    dd          d}|                    | j                   t          | j        | j        | j        j        ||| j                  S )zGet Apollo optimizer.zYou need to install `apollo_torch` in order to use APOLLO optimizers. Install it with `pip install git+https://github.com/zhuhanqing/APOLLO`r   )APOLLOAdamWr   r   projrandom
scale_typechannelr   r   r         ?r   r   )r   r   r   r   r   r   )r   r   apollo_torchr   r   APOLLO_ADAMWAPOLLO_ADAMW_LAYERWISEr   r#   r   r   rs   r!   r~   r   r   rA   r    )r   r   rF   apollo_optim_kwargss       r+   _get_apollo_optimizerr   k  sA   $&& 
U
 
 	
 )((((( 	#[-{ CN&&vs3344""6844n((yAAs~112CSIIJJs~))'37788^''U;;  s///$#)SX^->@SUXUi  r*   c                    t                      st          d          | j        t          d          ddlm}m} d| j        j        v r|n|}| j	        
                    d| j        i           || j	        fS )zGet LOMO optimizer.zjYou need to install `lomo_optim` in order to use LOMO optimizers. Install it with `pip install lomo-optim`NzMYou need to pass a `model` in order to correctly initialize a LOMO optimizer.r   )AdaLomoLomoadar   )r   r   r   re   
lomo_optimr   r   r   rA   r    rs   )r   r   r   ru   s       r+   _get_lomo_optimizerr     s     
7
 
 	

 yhiii(((((((($66GGDM#) 4555#...r*   c                   t                      st          d          ddlm} | j                            t          | j                            dd                    t          | j                            dd                    t          | j                            dd	                    t          | j                            d
d	                    t          | j                            dd                    d           || j        fS )zGet GrokAdamW optimizer.z5Please install grokadamw with `pip install grokadamw`r   )	GrokAdamW
alpha_initg\(\?lamb       @gammag?grokking_signal_decay_rategradient_clippingr   )r   r   r   r   r   )	r   re   	grokadamwr   r    rs   r   r#   r   )r   r   s     r+   _get_grokadamwr    s    !## RPQQQ###### 2 2< F FGG#.,,VS99::3>--gs;;<<*/0B0BC_ad0e0e*f*f!&s~'9'9:Ms'S'S!T!T	
 	
   c***r*   c           	     l   t                      rGt          j        t          j                            d                    t          j        d          k     rt          d          t          j        t          j                            d                    t          j        d          k    rt          d          t          j        t          j                            d                    t          j        d          k    r	dd	lm}m} ndd	l	m}m} | j
        j        t          j        k    r|}n|}| j                            | j                            d
d          t%          | j                            dd                    d           | j                            | j                   || j        fS )z%Get TorchAO 4-bit or 8-bit optimizer.torchaoz0.4.0zYou need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers. Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/aor@   z2.4zYou need to have `torch>2.4` in order to use torch 4-bit optimizers. Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly.z0.11.0r   )	AdamW4bit	AdamW8bit
block_size   bf16_stochastic_roundr   )r  r  )r   r   parse	importlibmetadatar   torchao.optimr  r  torchao.prototype.low_bit_optimr   rA   r   ADAMW_TORCH_4BITr    rs   r#   r   r   r!   )r   r  r  ru   s       r+   _get_torchao_optimizerr    s   !! 
W]93E3M3Mi3X3X%Y%Y\c\ijq\r\r%r%r,
 
 	

 }Y'//8899W]5=Q=QQQ<
 
 	
 }Y'//	::;;w}X?V?VVV666666666HHHHHHHH
x~888!!.,,\3??%.s~/A/ABY[b/c/c%d%d	
 	
   000#...r*   c           	        t                      st          d          ddlm}m} i }d}| j        j        t          j        k    r0t          d          st          d          ddlm	} |}| j
        }d}nP| j        j        t          j        k    r
|}| j
        }n,| j        j        t          j        k    r|}nt          d	          | j        j        |d
<   |r| j        j        |d<   |                    t#          | j                            dd                    t#          | j                            dd                    d           | j                            |           || j        fS )zGet ScheduleFree optimizer.zwYou need to install `schedulefree` in order to use schedulefree optimizers. Install it with `pip install schedulefree.`r   )AdamWScheduleFreeSGDScheduleFreeTz1.4.0zYou need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. Install it with `pip install schedulefree.`)RAdamScheduleFreeFzInvalid schedulefree optimizerweight_decaywarmup_stepsweight_lr_powerr   rg        )r  r  )r   r   schedulefreer  r  r   rA   r   SCHEDULE_FREE_RADAMr  r!   SCHEDULE_FREE_ADAMWSCHEDULE_FREE_SGDre   r  r  rs   r   r#   r   r    )r   r  r  r   require_warmupr  ru   s          r+   _get_schedule_free_optimizerr    s   $&& 
:
 
 	
 @??????? N
x~;;;(11 	>   	322222)"%/	>=	=	=)"%/	>;	;	;'9:::.1h.CN+ H25(2G/""$S^%7%78I3%O%OPPs~))#s3344	
 	
    7888#...r*   c                   t                      st          d          ddlm} | j                            dd          }|t          |          }| j                            dd          }|t          |          }| j        j	        | j
        d<   t          | j                            dd	                    ||d
}| j                            | j
                   | j                            |           || j        fS )z,Get StableAdamW optimizer from torch-optimi.zwYou need to install `torch-optimi` in order to use stable_adamw optimizers. Install it with `pip install torch-optimi`.r   )StableAdamWmax_lrN	kahan_sumr  decouple_lrF)r"  r   r!  )r   r   optimir  r#   r   r   r;   r   r  r!   r    rs   )r   r  r   r!  stable_adamw_kwargss        r+   _get_stable_adamwr%    s   $&& 
:
 
 	
 #"""""^$//Fv"";55IOO	&)h&;CON#CN..}eDDEE  000 3444,,,r*   zdict[str, OptimizerHandler]_OPTIMIZER_HANDLERS)r,   r-   r.   r"   )r:   r   r.   r;   )T)r   r   r   r   rD   rE   rF   r   rG   r   r    r   rH   r;   r.   rI   )r   r   r.   rI   )ur'   
__future__r   importlib.metadatar
  loggingcollections.abcr   dataclassesr   typingr   r   r@   	packagingr   r	   optimizationr   trainer_pt_utilsr   trainer_utilsr   training_argsr   r   utilsr   r   r   r   r   r   r   r   r   modeling_utilsr   r   	getLoggerr$   rj   r   r9   tupledictrE   OptimizerHandlerrC   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r%  	ADAMW_BNB
ADAMW_8BITPAGED_ADAMWPAGED_ADAMW_8BITADEMAMIXADEMAMIX_8BITPAGED_ADEMAMIXPAGED_ADEMAMIX_8BITLION	LION_8BIT
PAGED_LIONPAGED_LION_8BITRMSPROP_BNBRMSPROP_8BITRMSPROP_32BIT_BITSANDBYTES_OPTIMIZERSr   r   r   r   r   r   _GALORE_OPTIMIZERSr   r   _APOLLO_OPTIMIZERSr  ADAMW_TORCH_8BIT_TORCHAO_OPTIMIZERSr  r  r  _SCHEDULE_FREE_OPTIMIZERS	ADAFACTORADAMW_TORCHr   ADAMW_TORCH_XLAADAMW_TORCH_NPU_FUSEDADAMW_APEX_FUSEDADAMW_ANYPRECISIONr   ADAGRADRMSPROP	GROKADAMWSTABLE_ADAMWLOMOADALOMOfromkeysr&  r(   r)   r*   r+   <module>rZ     s     # " " " " "      $ $ $ $ $ $ ! ! ! ! ! ! % % % % % % % %              # # # # # # 5 5 5 5 5 5 5 5 5 5 5 5 7 7 7 7 7 7 7 7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  1//////000000		8	$	$            -.c4S>6I0JJK    8 $(V+ V+ V+ V+ V+|+ + + +' ' ' 'T T T TP P P Pc c c c// // // //dQ Q Q Q*1 1 1 1
5 5 5 5
5 5 5 5
+ + + +B   :/ / / /$+ + + +& /  /  /  /F)/ )/ )/ )/X- - - -D # !&"  & $#).-  )  ##  &&$  n 0$&6"$8(*D#%:%'>LLn!2,/4 mm,.IJJ4  mm&(=>>!4" mm&(=>>#4$ mm')?@@%4& mm-/KLL'4       r*   