
    bi-                         d dl mZ ddlmZ erddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ  e	            r
d dlZdd	lmZ  ej        e          ZdZ G d
 de          ZdS )    )TYPE_CHECKING   )HfQuantizer   )PreTrainedModel)is_accelerate_availableis_kernels_availableis_torch_availableis_triton_availablelogging)get_module_from_nameN)WeightConverterc                        e Zd ZdZdZ fdZd Zd Zddded	e	fd
Z
ddZ	 dddde	fdZd Zd Zd Zd Zed	e	fd            Zd Zd Z xZS )Mxfp4HfQuantizerz/
    FP4 quantization using fbgemm kernels
    Fc                 J     t                      j        |fi | d | _        d S N)super__init__triton_kernels_hub)selfquantization_configkwargs	__class__s      b/root/projects/butler/venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_mxfp4.pyr   zMxfp4HfQuantizer.__init__0   s1    ,77777"&    c                     | j         5	 ddlm}  |d          | _         n# t          $ r t          d          w xY w| j         S )z3Lazy import and initialize kernels only when neededNr   )
get_kernelz(kernels-community/gpt-oss-triton-kernelsz2kernels package is required for MXFP4 quantization)r   integrations.hub_kernelsr   ImportError)r   r   s     r   _lazy_import_kernelsz%Mxfp4HfQuantizer._lazy_import_kernels4   sp    "*XAAAAAA*4*5_*`*`'' X X X!"VWWWX&&s     :c                    t                      st          d          | j        j        rd S t          j                                        s\t          j                                        s>| j        r(t          
                    d           d| j        _        d S t          d          t                      st          d          t          j                                        r d}t          d          ot                      }nAt          j                                        }|dk    }t          d          ot                      }| j        rU|s(t          
                    d	           d| j        _        d S |s(t          
                    d
           d| j        _        d S n"|st!          d          |st!          d          | j        s|                                  |                    d          }|t          
                    d           d S t'          |t(                    rB| j        s=d|                                v sd|                                v rt!          d          d S d S d S )NzqUsing mxfp4 quantization requires torchPlease install the latest version of torch ( pip install --upgrade torch )z^Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16Tz-Quantizing a model using MXFP4 requires a GPUz9Using mxfp4 requires Accelerate: `pip install accelerate`z3.5.0)      z3.4.0u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) We will default to dequantizing the model to bf16.zMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16u   MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g T4, A100, L4, H100, or B200) or XPUs (e.g Intel® Data Center GPU Max Series) zuMXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0
device_mapzYou have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU/XPU device in order to run your model. To remove this warning, pass device_map = 'cuda' or device_map = 'xpu'. cpudiskzYou are attempting to load an FP4 model with a device_map that contains a CPU or disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the CPU or disk device from the device_map.)r
   r   r   
dequantizetorchcudais_availablexpupre_quantizedloggerwarning_onceRuntimeErrorr   r   r	   get_device_capability
ValueErrorr    get
isinstancedictvalues)r   argsr   gpu_is_supportedkernels_availablecompute_capabilityr$   s          r   validate_environmentz%Mxfp4HfQuantizer.validate_environment?   s   !## 	]  
 #. 	Fz&&(( 	T1G1G1I1I 	T! T##t   7;(3"#RSSS&(( 	[YZZZ9!!## 	X# 3G < < WAUAWAW!&!A!A!C!C1V; 3G < < WAUAWAW 	# ##I   7;(3$ ##    7;(3 " 		 r   # 	 H   ! 	(%%'''ZZ--
V     
D)) 	% 5J4E4E4G4G+G+G6U_UfUfUhUhKhKh n  	 	 KhKhr   modelr   
param_namereturnc                 h    ddl m} t          ||          \  }}t          ||          r|dv rdS dS dS )Nr   Mxfp4GptOssExperts)down_proj_biasgate_up_proj_biasFT)integrationsr@   r   r3   )r   r;   r<   r   r@   moduletensor_names          r   param_needs_quantizationz)Mxfp4HfQuantizer.param_needs_quantization   sV    55555525*EEf011 	EEEu4ur   c                     t           j                                        r t           j                                         d S t           j                                        r t           j                                         d S d S r   )r(   r)   r*   empty_cacher+   )r   r;   r   s      r   #_process_model_after_weight_loadingz4Mxfp4HfQuantizer._process_model_after_weight_loading   sl    :""$$ 	$J""$$$$$Y##%% 	$I!!#####	$ 	$r   use_kernelsc                     ddl m} |r&t                              d           d| j        _        |                     || j        j        |j                  | _         ||| j        | j                  }d S )Nr   )replace_with_mxfp4_linearzYou are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=FalseT)modules_to_not_convertr   )	rC   rL   r-   r.   r   r'   get_modules_to_not_convertrM   _keep_in_fp32_modules)r   r;   rJ   r   rL   s        r   $_process_model_before_weight_loadingz5Mxfp4HfQuantizer._process_model_before_weight_loading   s     	=<<<<<  	7e   37D$/&*&E&E4+BED_'
 '
# *)$*E[_[s
 
 
r   c                     d|j         j        v r0t          |dd           |j                            ddddd           |S )NGptOssConfigbase_model_tp_plangrouped_gemmz(layers.*.mlp.experts.gate_up_proj_blocksz(layers.*.mlp.experts.gate_up_proj_scalesz%layers.*.mlp.experts.down_proj_blocksz%layers.*.mlp.experts.down_proj_scales)r   __name__getattrrS   updater   configs     r   update_tp_planzMxfp4HfQuantizer.update_tp_plan   ^    V-666v3T::F)00DRDRAOAO	    r   c                     d|j         j        v r0t          |dd           |j                            ddddd           |S )NrR   base_model_ep_planrT   rU   )r   rV   rW   r^   rX   rY   s     r   update_ep_planzMxfp4HfQuantizer.update_ep_plan   r\   r   c                     ddl m} |                                }t          |j        dd          }t          |j        dd          }|                                D ]\  }}t          ||          rt          |d          rt          |d          rt|j        j	        j
                            |j        j	        j                                      d	d
                              |d	dd          || d<   |j        j        j	        j
                            |j        j        j	        j                                      d	d
          || d<   |j        j	        j
                            |j        j	        j                                      d	d
                              ||dd	          || d<   |j        j        j	        j
                            |j        j        j	        j                                      d	d
          || d<   i }||fS )Nr   r?   num_local_experts    hidden_sizei@  gate_up_proj	down_projZ      z.gate_up_proj_blocksz.gate_up_proj_scalesz.down_proj_blocksz.down_proj_scales)rC   r@   
state_dictrW   rZ   named_modulesr3   hasattrrd   storagelayoutunswizzle_datadata	transposereshapegate_up_proj_precision_configweight_scalere   down_proj_precision_config)	r   r;   r@   rj   ra   rc   namerD   metadatas	            r   get_state_dict_and_metadataz,Mxfp4HfQuantizer.get_state_dict_and_metadata   s   555555%%''
 $EL2ErJJelM4@@!//11 	 	LD&6#566FN33 FK00 '/6EEfFYFaFfggYr2&&W.B;; d8889 8EMTcc<IQV iB'' d8889 $,3BB6CSC[C`aaYr2&&W.RDD d5556 5BJQ``9FNS iB'' d5556 8##r   c                     dS )NT r   s    r   is_serializablez Mxfp4HfQuantizer.is_serializable   s    tr   c                 :    t                               d           dS )NzMXFP4 quantization don't support training, please consider dequantizing the model first by passing quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()F)r-   r.   r{   s    r   is_trainablezMxfp4HfQuantizer.is_trainable   s'     x	
 	
 	
 ur   c                 $    ddl m}  ||           S )Nr   )Mxfp4Quantize)integrations.mxfp4r   )r   r   s     r   get_quantize_opsz!Mxfp4HfQuantizer.get_quantize_ops   s$    666666}T"""r   c                     ddl m}m} | j        rJ| j        j        rt          ddgd ||           g          gS t          ddgd ||           g          gS g S )Nr   )Mxfp4DequantizeMxfp4Deserialize_blocks_scales )source_patternstarget_patterns
operations)r   r   r   r,   r   r'   r   )r   r   r   s      r   get_weight_conversionsz'Mxfp4HfQuantizer.get_weight_conversions  s    JJJJJJJJ 	'2 #)2I(>(*$3OD$9$9#:    $)2I(>(*$4$4T$:$:#;    	r   )r;   r   )F)rV   
__module____qualname____doc__requires_calibrationr   r    r:   strboolrF   rI   rP   r[   r_   rx   r|   propertyr~   r   r   __classcell__)r   s   @r   r   r   )   s]         !' ' ' ' '	' 	' 	'I I IV.? S _c    $ $ $ $ "
 
 
 
 
 
 
0    %$ %$ %$N   d    X# # #
      r   r   )typingr   baser   modeling_utilsr   utilsr   r	   r
   r   r   quantizers_utilsr   r(   core_model_loadingr   
get_loggerrV   r-   r   r   rz   r   r   <module>r      s%   !                  1000000              3 2 2 2 2 2  5LLL444444		H	%	% n n n n n{ n n n n nr   