
    bi&                     |    d dl Z d dlZd dlZd dlmZ ddlmZmZmZm	Z	  e	j
        e          ZdefdZd Zd	dZdS )
    N)
DataLoader   )WEIGHTS_NAMEPushToHubMixinis_torch_xla_availablelogging
dataloaderc                     t                      redd lmc m} t	          | |j                  s
J d            dd lmc m} |                    |	                                d          }|| j
        d<   | S | S )Nr   zPThe dataloader must be a `torch_xla.distributed.parallel_loader.MpDeviceLoader`.)fsdpNinput_sharding)r   %torch_xla.distributed.parallel_loaderdistributedparallel_loader
isinstanceMpDeviceLoadertorch_xla.distributed.spmdspmdShardingSpecget_global_mesh_parallel_loader_kwargs)r	   plxssharding_specs       X/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/tpu.pytpu_spmd_dataloaderr      s     :::::::::*b&788 	
 	
^	
 	
 	
 	0////////(:(:(<(<nMM?L
*+;<    c                 8   ddl mc m ddlmc m ddlm} 	 ddlm	 ddlm
 ddlm}m} rddlm n# t           $ r t!          d	          w xY wd}d}t#          | d
d          }|j                            d|          }	|j        d         dk    r"t)          j        ||j        d                   }n]|	[t-                      }
|	D ]4} || |          }|t/          d          |
                    |           5t)          j        ||
          }|j        }|j        d         r:| j        j        r&t8                              d           d| j        _        fd}rfd} | |||          } n | f||d|} di ffd	}|_        | S )a.  
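
# A minimal usage sketch for `tpu_spmd_dataloader` (hypothetical setup, not part
# of this module): the raw `DataLoader` must already be wrapped in an
# `MpDeviceLoader`, and an SPMD mesh with an "fsdp" axis must have been
# registered via `xs.set_global_mesh` beforehand. `train_dataset` is a placeholder.
#
#     import torch_xla.core.xla_model as xm
#     import torch_xla.distributed.parallel_loader as pl
#
#     loader = pl.MpDeviceLoader(DataLoader(train_dataset, batch_size=8), xm.xla_device())
#     loader = tpu_spmd_dataloader(loader)  # batch dim now sharded across "fsdp"
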
def wrap_model_xla_fsdp(model, args, is_fsdp_xla_v2_enabled):
    """
    Wraps a model with XLA Fully Sharded Data Parallelism (FSDP).

    Handles both FSDP v1 (`XlaFullyShardedDataParallel`) and v2 (`SpmdFullyShardedDataParallel`),
    including auto-wrap policies, gradient checkpointing, and patching `xm.optimizer_step`.

    Args:
        model (`torch.nn.Module`): The model to wrap.
        args (`TrainingArguments`): The training arguments containing FSDP configuration.
        is_fsdp_xla_v2_enabled (`bool`): Whether FSDP v2 (SPMD) is enabled.

    Returns:
        `torch.nn.Module`: The FSDP-wrapped model.
    """
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.spmd as xs

    from ..trainer_pt_utils import get_module_class_from_name

    try:
        from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
        from torch_xla.distributed.fsdp import checkpoint_module
        from torch_xla.distributed.fsdp.wrap import (
            size_based_auto_wrap_policy,
            transformer_auto_wrap_policy,
        )

        if is_fsdp_xla_v2_enabled:
            from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
                SpmdFullyShardedDataParallel as FSDPv2,
            )
    except ImportError:
        raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")

    auto_wrap_policy = None
    auto_wrapper_callable = None

    default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None)
    fsdp_transformer_layer_cls_to_wrap = args.fsdp_config.get(
        "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
    )

    if args.fsdp_config["min_num_params"] > 0:
        auto_wrap_policy = functools.partial(
            size_based_auto_wrap_policy, min_num_params=args.fsdp_config["min_num_params"]
        )
    elif fsdp_transformer_layer_cls_to_wrap is not None:
        transformer_cls_to_wrap = set()
        for layer_class in fsdp_transformer_layer_cls_to_wrap:
            transformer_cls = get_module_class_from_name(model, layer_class)
            if transformer_cls is None:
                raise Exception("Could not find the transformer layer class to wrap in the model.")
            transformer_cls_to_wrap.add(transformer_cls)

        auto_wrap_policy = functools.partial(
            transformer_auto_wrap_policy,
            transformer_layer_cls=transformer_cls_to_wrap,
        )

    fsdp_kwargs = args.xla_fsdp_config
    if args.fsdp_config["xla_fsdp_grad_ckpt"]:
        if model.config.use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            model.config.use_cache = False

        # Apply gradient checkpointing to auto-wrapped sub-modules.
        def auto_wrapper_callable(m, *args, **kwargs):
            target_cls = FSDP if not is_fsdp_xla_v2_enabled else FSDPv2
            return target_cls(checkpoint_module(m), *args, **kwargs)

    if is_fsdp_xla_v2_enabled:

        def shard_output(output, mesh):
            from ..modeling_outputs import CausalLMOutputWithPast

            real_output = None
            if isinstance(output, torch.Tensor):
                real_output = output
            elif isinstance(output, tuple):
                real_output = output[0]
            elif isinstance(output, CausalLMOutputWithPast):
                real_output = output.logits

            if real_output is None:
                raise ValueError("Something went wrong, the output of the model shouldn't be `None`")
            xs.mark_sharding(real_output, mesh, ("fsdp", None, None))

        model = FSDPv2(
            model,
            shard_output=shard_output,
            auto_wrap_policy=auto_wrap_policy,
            auto_wrapper_callable=auto_wrapper_callable,
        )
    else:
        model = FSDP(
            model,
            auto_wrap_policy=auto_wrap_policy,
            auto_wrapper_callable=auto_wrapper_callable,
            **fsdp_kwargs,
        )

    # Patch `xm.optimizer_step` so it does not reduce gradients here, as FSDP
    # does not need gradient reduction over sharded parameters.
    def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}):
        loss = optimizer.step(**optimizer_args)
        if barrier:
            xm.mark_step()
        return loss

    xm.optimizer_step = patched_optimizer_step

    return model
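
# A minimal usage sketch for `wrap_model_xla_fsdp` (hypothetical; `training_args`
# stands in for a `TrainingArguments` instance whose `fsdp_config` provides the
# keys read above, e.g. "min_num_params", "transformer_layer_cls_to_wrap" and
# "xla_fsdp_grad_ckpt", and whose `xla_fsdp_config` holds kwargs for FSDP v1):
#
#     import torch_xla.core.xla_model as xm
#
#     model = wrap_model_xla_fsdp(model, training_args, is_fsdp_xla_v2_enabled=False)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
#     loss = model(**batch).loss
#     loss.backward()
#     xm.optimizer_step(optimizer, barrier=True)  # patched: steps, then xm.mark_step()
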
def save_tpu_checkpoint(model, args, accelerator, processing_class, is_fsdp_xla_v1_enabled, output_dir=None):
    """
    Saves a model checkpoint on TPU/XLA devices.

    Handles FSDP v1 sharded checkpoints (with consolidation on master), as well as
    standard XLA model saving via `save_pretrained` or `xm.save`.

    Args:
        model (`torch.nn.Module`): The model to save.
        args (`TrainingArguments`): The training arguments.
        accelerator (`Accelerator`): The accelerator instance.
        processing_class: The processing class (tokenizer/processor) to save alongside the model.
        is_fsdp_xla_v1_enabled (`bool`): Whether FSDP XLA v1 is enabled.
        output_dir (`str`, *optional*): The directory to save to. Defaults to `args.output_dir`.
    """
    import torch_xla.core.xla_model as xm

    output_dir = output_dir if output_dir is not None else args.output_dir

    logger.info(f"Saving model checkpoint to {output_dir}")
    xm.mark_step()

    if xm.is_master_ordinal(local=False):
        os.makedirs(output_dir, exist_ok=True)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))

    # Save a trained model and configuration using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    supported_classes = (PushToHubMixin,)
    xm.rendezvous("saving_checkpoint")
    if is_fsdp_xla_v1_enabled:
        ckpt = {
            "model": model.state_dict(),
            "shard_metadata": model.get_shard_metadata(),
        }
        ckpt_path = os.path.join(output_dir, f"rank{args.process_index}-of-{args.world_size}-{WEIGHTS_NAME}")
        # All ranks save their sharded checkpoint.
        xm.save(ckpt, ckpt_path, master_only=False)
        # Make sure all ranks have finished saving before consolidating.
        xm.rendezvous("save_full_checkpoints")
        # Master consolidates and saves the full checkpoint.
        if args.should_save:
            from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints

            full_state_dict, _ = consolidate_sharded_model_checkpoints(
                ckpt_prefix=os.path.join(output_dir, ""),
                ckpt_suffix=f"rank*-of-*-{WEIGHTS_NAME}",
                save_model=False,
            )
            model = model.module.module
            unwrapped_model = accelerator.unwrap_model(model)
            if isinstance(unwrapped_model, supported_classes):
                unwrapped_model.save_pretrained(output_dir, state_dict=full_state_dict)
            else:
                logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
    elif not isinstance(model, supported_classes):
        if isinstance(accelerator.unwrap_model(model), supported_classes):
            accelerator.unwrap_model(model).save_pretrained(
                output_dir,
                is_main_process=args.should_save,
                state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
            )
        else:
            logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
            state_dict = xm._maybe_convert_to_cpu(model.state_dict())
            xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
    else:
        model.save_pretrained(
            output_dir,
            is_main_process=args.should_save,
            state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
        )

    if processing_class is not None and args.should_save:
        processing_class.save_pretrained(output_dir)
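
# A minimal usage sketch for `save_tpu_checkpoint` (hypothetical objects; the
# Trainer normally supplies them):
#
#     save_tpu_checkpoint(
#         model=wrapped_model,
#         args=training_args,
#         accelerator=accelerator,
#         processing_class=tokenizer,
#         is_fsdp_xla_v1_enabled=True,
#         output_dir="checkpoints/step-500",
#     )
#
# With FSDP v1 enabled, every rank writes a `rank{i}-of-{n}-` shard of
# `WEIGHTS_NAME` and the master rank additionally writes the consolidated weights.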