
    bi7                        d dl mZ ddlmZ ddlmZ ddlmZmZ ddl	m
Z
mZmZmZmZ  e            r
d dlZd dlmZ  e
            rd d	lmZ  e            Z e            resd dlZ ej        e          Z G d
 de          Z G d dej        j                  Z G d dej                  Z ed          d             Z	 ddee          dz  fdZ!dS )    )	lru_cache   )ACT2FN)ConversionOps)get_module_from_nameshould_convert_module)is_accelerate_availableis_fbgemm_gpu_availableis_torch_availableis_torch_xpu_availableloggingN)nn)init_empty_weightsc            	           e Zd Zd Z	 ddeeej        eej                 z  f         dej	        j
        dz  deeej        f         fdZdS )FbgemmFp8Quantizec                     || _         d S N)hf_quantizer)selfr   s     _/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/fbgemm_fp8.py__init__zFbgemmFp8Quantize.__init__-   s    (    N
input_dictmodelreturnc                 &   t          |                                          d         \  }}|d         }ddlm} t	          ||          \  }}t          ||          r@|dk    r|                    dd          }	|	j        }
|	                    d|
d                   }t          |          \  }}|                    |
          }|                    dd          }|                    |
d         d|
d                   }n|dk    r|                    dd          }	|	j        }
|	                    d|
d                   }t          |          \  }}|                    |
          }|                    dd          }|                    |
d         |
d         d          }nPt          |          \  }}t          j                            |                    |j        d         d                    }|t          j                            |          | d|iS )	Nr   r   )FbgemmFp8Llama4TextExpertsgate_up_proj   	down_proj_scale)tupleitemsintegrationsr   r   
isinstance	transposeshapereshapequantize_fp8_per_rowtorchr   	Parameterview)r   r   r   kwargs
target_keyvaluer   moduletensor_nametransposed_paramoriginal_shapeflattened_paramnew_value_flatweight_scale_flat	new_valueweight_scales                   r   convertzFbgemmFp8Quantize.convert0   s&    "*"2"2"4"455a8
Ea======25*EEf899 #	[n,, $)??1a#8#8  "2!7"2":":2~b?Q"R"R 5I4Y4Y1 1 +22>BB	%//155	0889JA~^_O`aa++ $)??1a#8#8  "2!7"2":":2~b?Q"R"R 5I4Y4Y1 1 +22>BB	%//155	0889JN[\L]_`aa&:5&A&A#I| 8--l.?.?@RST@UWX.Y.YZZLEH..y99j;P;P;PR^__r   r   )__name__
__module____qualname__r   dictstrr+   Tensorlistr   Moduler:    r   r   r   r   ,   s        ) ) ) )-2` 2`elT%,-???@2` x%2`
 
c5<	 2` 2` 2` 2` 2` 2`r   r   c                   2     e Zd Zej        f fd	Zd Z xZS )FbgemmFp8Linearc                 x   t                                          |||           || _        || _        t          j                            t	          j        ||f|                    | _        t          j                            t	          j        |dft          j	                            | _
        |                     dt	          j        dgt          j                  d           |rIt          j                            t	          j        | j        t          j	                            | _        d S d | _        d S )Ndtyper   input_scale_ubF
persistent)superr   in_featuresout_featuresr+   r   r,   zerosweightfloat32r9   register_bufferfloatbias)r   rM   rN   rT   rH   	__class__s        r   r   zFbgemmFp8Linear.__init__f   s    lD999&(h((lK5PX])^)^)^__!H..u{L!;LTYTa/b/b/bcc-u{A3ek/R/R/R_deee 	**5;8IRWR_+`+`+`aaDIIIDIIIr   c                    g |j         d d         dR }t          |                    d|j         d                                                   | j                  \  }}| j                            t          j                  }t          rat          j
        || j                                        |                    d          |                                |j        | j                  }nAt          j        j                            || j        ||d          }| j        
|| j        z   n|}|                    |j                  }|                    |          }~~|S )Nr    )scale_ub)scale_ascale_b	out_dtyperT   Tuse_fast_accum)r(   r*   r-   
contiguousrI   r9   tor+   rQ   _is_torch_xpu_available
_scaled_mmrP   t	unsqueezerH   rT   opsfbgemmf8f8bf16_rowwisedevicer)   )r   xoutput_shapex_quantizedx_scaleweight_scale_float32outputs          r   forwardzFbgemmFp8Linear.forwardt   sJ   *"*r**  4AFF2qwr{4K4K4V4V4X4XcgcvwwwW
  $033EMBB" 	M%))"--,..00'Y  FF Y%66T['3GX\ 7  F ,09+@Vdi''fF18$$--r   )r;   r<   r=   r+   float8_e4m3fnr   rm   __classcell__rU   s   @r   rE   rE   e   sR        >C>Q            r   rE   c                   2     e Zd Zej        f fd	Zd Z xZS )r   c                    t                                                       |j        | _        |j        | _        |j        | _        | j        | _        t          |j                 | _	        t          j                            t          j        | j        | j        d| j        z  ft          j                            | _        t          j                            t          j        | j        d| j        dz  ft          j                            | _        t          j                            t          j        | j        | j        | j        ft          j                            | _        t          j                            t          j        | j        | j        dft          j                            | _        |                     dt          j        dgt          j                  d           d S )Nr   rG   r   rI   FrJ   )rL   r   num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimr   
hidden_actact_fnr+   r   r,   rO   rn   r   rQ   gate_up_proj_scaler!   down_proj_scalerR   rS   )r   configrH   rU   s      r   r   z#FbgemmFp8Llama4TextExperts.__init__   s   !3!'!9!-0V./!H..K)4+;Q=PQY^Ylmmm
 
 #(("4"4K)1do.AB%-XXX#
 #
 ++K)4?D<LMUZUhiii
 
  %x11K)4+;Q?u}UUU 
  
 	-u{A3ek/R/R/R_deeeeer   c           
      
   |                     | j        d| j                  }d}t          j        |          }t          | j                  D ]}||         }|                    d| j                  }t          ||| j                  \  }}| j	        j
        d         dz  }	| j                            t          j                  }
t          rt          j        || j	        |                             dd          d|	                                                                         |                    d          |
|         d         d|	                              dd                                                                          |j                  }t          j        || j	        |                             dd          |	d                                                                         |                    d          |
|         d         |	d                              dd                                                                          |j                  }n8t          j        j                            || j	        |                             dd          d|	                                         ||
|         d         d|	                              dd                                          d          }t          j        j                            || j	        |                             dd          |	d                                         ||
|         d         |	d                              dd                                          d          }||                     |          z  }t          ||| j                  \  }}| j                            t          j                  }t          rt          j        || j        |                             dd                                          |                    d          ||                              dd                                                                          |j                  }nt          j        j                            || j        |                             dd                                          |||                              dd                                          d          }|||<   |                    |j                  }|                     d| j                  S )	z
        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
        Returns:
            torch.Tensor: (batch_size * token_num, hidden_size)
        r    Nr   r   r   )rX   rY   rZ   Tr[   )r-   rt   rv   r+   
empty_likeranger)   r*   rI   r   r(   rz   r^   rQ   r_   r`   r'   r]   ra   rb   rH   rc   rd   re   ry   r{   r!   rf   )r   hidden_states
num_tokensnext_statesiexpert_hiddenexpert_hidden_reshapedexpert_quantizedexpert_scalesharded_expert_dimgate_up_proj_scale_float32gateup	activatedactivated_quantizedactivated_scaledown_proj_scale_float32expert_outputs                     r   rm   z"FbgemmFp8Llama4TextExperts.forward   s    &**4+;RAQRR
 &}55t'(( @	+ @	+A)!,M%2%:%:2t?O%P%P"-A&
D4G. .*l "&!2!8!<!A)-)@)C)CEM)R)R&& '$%a(221a889L:L9LMXXZZ\\^^(222666q9!<=P>P=PQVVWY[\]]hhjjllnn+1   %$%a(221a889K9L9LMXXZZ\\^^(222666q9!<=O=P=PQVVWY[\]]hhjjllnn+1   y'88$%a(221a889L:L9LMXXZZ .q1!45H6H5HINNrSTUU``bb#' 9   Y%66$%a(221a889K9L9LMXXZZ .q1!45G5H5HINNrSTUU``bb#' 7   T[[...I3G	S]_c_r3s3s0&*&:&=&=em&L&L#&  % 0'N1%//155@@BB+55b993A6;;BBBMMOOQQSS+1! ! ! !&	 0 A A'N1%//155@@BB#+A.33B::EEGG#' !B ! ! +KNN!nn]%9::D$4555r   )r;   r<   r=   r+   rQ   r   rm   ro   rp   s   @r   r   r      s^        %*] f f f f f f0P6 P6 P6 P6 P6 P6 P6r   r   r   )maxsizec                  h    t           rddlm}   | d          j        S t          j        j        j        S )Nr   
get_kernelzkernels-community/fp8-fbgemm)r_   hub_kernelsr   r*   r+   rc   rd   r   s    r   get_quantize_fp8_per_rowr      s?     O++++++z899NN900r   Fmodules_to_not_convertc                 p   t                      ad}|ri nddi}|                                 D ]\  }}t          ||          sd}	t	          d          5  |j        j        dk    r2t          | j        d| j                  }
t          |
p| j                  }	nOt          |t          j                  r5t          |j        |j        |j        dufi |}	|	                    d           ddd           n# 1 swxY w Y   |	|                     ||	           d}|st&                              d           | S )	a  
    A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules.
    This will enable running your models using high performance fp8 kernel from FBGEMM library.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    FrH   NT)include_buffersLlama4TextExpertstext_configzYou are loading your model using FP8 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   r*   named_modulesr   r   rU   r;   getattrr|   r   r&   r   LinearrE   rM   rN   rT   requires_grad_set_submoduleloggerwarning)r   r   quantization_configpre_quantizedtp_planhas_been_replacedmodule_kwargsmodule_namer1   
new_moduler   s              r   replace_with_fbgemm_fp8_linearr   	  s   $ 455'<BBgt_M$2244 ! !V$[2HII 	
555 	1 	1(,???
 &elM5<PP78Su|TT

FBI.. 1,&'Kt+  $	 
 ))%000	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1 	1" K444  
	
 	
 	
 Ls   BC33C7	:C7	)NNFN)"	functoolsr   activationsr   core_model_loadingr   quantizers.quantizers_utilsr   r   utilsr	   r
   r   r   r   r+   r   
accelerater   r_   fbgemm_gpu.experimental.gen_ai
fbgemm_gpu
get_loggerr;   r   r   r   rE   rB   r   r   rA   r?   r   rC   r   r   <module>r      s-                      . . . . . . U U U U U U U U               LLL .------0022  *%< *))))		H	%	%6` 6` 6` 6` 6` 6` 6` 6`r, , , , ,eho , , ,^i6 i6 i6 i6 i6 i6 i6 i6X 11 1 1 tx: :#'9t#3: : : : : :r   