
    bi]                        d dl mZ d dlmZ d dlmZmZmZmZ  e            rddl	Z	ddl
mZ ddlZddlmZ ddl
mZ  ej        e          Zdad Zdee         dz  d	e	j        d
efdZe	j        Z e	j        e          j        Z  e	j        e          j!        Z"ej#        dej$        fd            Z%d.de	j&        ded
e'e	j&        e	j&        f         fdZ(ej#        dej$        dej$        dej$        dej$        fd            Z)ej#        dej$        dej$        dej$        dej$        fd            Z*e	j+        fde	j&        de	j&        de	j&        de	j&        dee         d	e	j        d
e	j&        fdZ,e	j+        fde	j&        de	j&        de	j&        de	j&        dee         d	e	j        d
e	j&        fdZ-e	j.        de	j+        fde	j&        de	j&        de	j&        d e	j&        de'eef         dz  d	e	j        d
e	j&        fd!            Z/ G d" d#ej0                  Z1d$ Z2 G d% d&ej3                  Z4	 d/d(ee5         dz  fd)Z6 G d* d+e          Z7 G d, d-e          Z8dS )0   )ConversionOps)should_convert_module)is_kernels_availableis_torch_accelerator_availableis_torch_availablelogging    N)
functionalc                      t           J	 ddlm}   | d          a n7# t          $ r*}t                              d| d           da Y d}~nd}~ww xY wt           rt           ndS )zALazily load the CUTLASS quantization kernel from HuggingFace Hub.N   )
get_kernelzRedHatAI/quantizationz,Failed to load CUTLASS quantization kernel: . Falling back to Triton.F)_quantization_kernelhub_kernelsr   	Exceptionloggerwarning_once)r   es     d/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/finegrained_fp8.py_get_quantization_kernelr   "   s     #	)//////#-:.E#F#F   	) 	) 	) kq k k klll#(      	) $8ATAs    
A A

A
block_sizeoutput_dtypereturnc                    t                      r,t          j                                        rt	                      sdS |t          j        t          j        fvrdS | dS t          |           dk    s| d         dk    s| d         dk    rdS t          j                                        }|d         dz  |d         z   }t                      }|dS 	 |
                    |          S # t          $ r Y dS w xY w)a;  
    Check if CUTLASS blockwise FP8 matmul is supported for the given block size and output dtype.

    CUTLASS blockwise kernels require:
    - SM90+ (Hopper or newer)
    - Block size [128, 128] for weights
    - Block size [1, 128] for activations (handled implicitly)
    - Output dtype bfloat16 or float16
    FNr   r	      r   
   )r   torchcudais_availabler   bfloat16float16lenget_device_capabilityr   $cutlass_scaled_mm_supports_block_fp8r   )r   r   
capabilitycuda_capabilitykernels        r   _supports_cutlassr(   0   s     uz'>'>'@'@ H\H^H^ u ENEM:::u u
:!z!}33z!}7K7Ku 1133J mb(:a=8O &''F~u::?KKK   uus   C$ $
C21C2
BLOCK_SIZEc                    t          j        d          }||z  t          j        d|          z   }t          j        | |z                                 t           j                  }t          j        t          j        |                    dz  }||z  }|                    |j        j	                  }t          j
        ||z   |           t          j
        ||z   |           d S )Nr	   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr)   pidoffsxsys	            r   act_quant_kernelr@   ]   s    
-Q


Cbi:666D
  ,,A
rvayyE!A	AA	U[#$$AHUT\1HUS[!    r   r=   c                 x                                      sJ  j        d         |z  dk    sJ t          j         t          j                  }  j        g                                  d d                              d          |z  R dt          j        i} fd}t          |          |||           ||fS )Nr	   r5   r5   c                 `    t          j                                        | d                   fS )Nr)   )tritoncdivnumel)metar=   s    r   gridzact_quant.<locals>.grido   s%    AGGIItL'9::<<rA   )r)   )	is_contiguousshaper   
empty_likefloat8_e4m3fn	new_emptysizer2   r@   )r=   r   r?   r>   rJ   s   `    r   	act_quantrQ   i   s    ??72;#q((((%"5666ARQVVXXcrc]RAFF2JJ*$<RRREMRRA= = = = = T1az::::a4KrA   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc                    t          j        d          }t          j        ||          }t          j        ||          }||z  }||z  }||z  }t          ||z
  |          }|||z  z   }||z  |z  } ||z  t          j        d|          z   |z  }!| |z  t          j        d|          z   |z  }"t          j        d|          }#| |!dddf         |
z  |#dddf         |z  z   z   }$||#dddf         |z  |"dddf         |z  z   z   }%||!|z  z   }&|"|z  }'||'|z  z   }(t          j        ||ft           j                  })t          dt          j        ||                    D ]}*t          j        |$|#dddf         ||*|z  z
  k     d          }+t          j        |%|#dddf         ||*|z  z
  k     d          },|*|z  }-|-|	z  }.t          j        |&|.|z  z             }/t          j        |(|.|z  z             }0|)t          j	        |+|,          |/dddf         z  |0dddf         z  z  })|$||z  z  }$|%||z  z  }%|j
        j        t           j        k    r |)                    t           j                  }1nY|j
        j        t           j        k    r |)                    t           j                  }1n|)                    t           j                  }1||z  t          j        d|          z   }2| |z  t          j        d|          z   }3|||2dddf         z  z   ||3dddf         z  z   }4|2dddf         |k     |3dddf         |k     z  }5t          j        |4|1|5           dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r	   r+   NrD           maskotherrY   )r-   r.   rG   minr/   zerosr2   ranger0   dotr5   r6   r    r1   r!   r7   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nrR   rS   rT   rU   r;   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_masks6                                                         r   _w8a8_block_fp8_matmulr   w   s   J -Q


C<((I<((I#i/&&H\)Ky;.==L3-.E##4E|#bi<&@&@@AEG|#bi<&@&@@AEGYq,''F'!!!T'"Y.aaa91LLMF&D/I-aaa0@90LLMF7[((G'!H8k))G(L,7rzJJJK1bga..// + +GFaaa1q<7G3G!GsSSSGF41q<7G3G!GsSSSl"W$gg+ 5566gg+ 5566rva||c!!!T'l2Sqqq\AA,**,**wR[((NN2;''	
	rz	)	)NN2:&&NN2:&&l"RYq,%?%??Gl"RYq,%?%??GWQQQW---	GD!!!G<L0LLFaaag"wtQQQw'7!';<FHVQV$$$$$$rA   c                 @   t          j        d          }t          j        ||          }t          j        ||          }||z  }||z  }||z  }t          ||z
  |          }|||z  z   }||z  |z  }||z  t          j        d|          z   |z  }||z  t          j        d|          z   |z  }t          j        d|          }| |dddf         |
z  |dddf         |z  z   z   } ||dddf         |z  |dddf         |z  z   z   }!t          j        |          }"t          j        |          }#t          j        ||ft           j                  }$t          dt          j        ||                    D ]}%t          j        | |dddf         ||%|z  z
  k     d          }&t          j        |!|dddf         ||%|z  z
  k     d          }'|$t          j	        |&|'          |"z  |#z  z  }$| ||z  z  } |!||z  z  }!|j
        j        t           j        k    r |$                    t           j                  }(nY|j
        j        t           j        k    r |$                    t           j                  }(n|$                    t           j                  }(||z  t          j        d|          z   })||z  t          j        d|          z   }*|||)dddf         z  z   ||*dddf         z  z   }+|)dddf         |k     |*dddf         |k     z  },t          j        |+|(|,           dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with per-tensor quantization, and
    store the result in output tensor `C`.
    r	   r+   NrD   rW   rX   r[   )r-   r.   rG   r\   r/   r0   r]   r2   r^   r_   r5   r6   r    r1   r!   r7   )-r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rR   rS   rT   rU   r;   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   scale_ascale_br   r   r   r   r   r   r   r   r   s-                                                r   !_w8a8_block_fp8_matmul_per_tensorr      sP   B -Q


C<((I<((I#i/&&H\)Ky;.==L3-.E##4E|#bi<&@&@@AEG|#bi<&@&@@AEGYq,''F'!!!T'"Y.aaa91LLMF&D/I-aaa0@90LLMFgbkkGgbkkG(L,7rzJJJK1bga..// + +GFaaa1q<7G3G!GsSSSGF41q<7G3G!GsSSSrva||g-77,**,**wR[((NN2;''	
	rz	)	)NN2:&&NN2:&&l"RYq,%?%??Gl"RYq,%?%??GWQQQW---	GD!!!G<L0LLFaaag"wtQQQw'7!';<FHVQV$$$$$$rA   r`   ra   rc   rd   c                    |d\  }}n%t          |          dk    sJ |d         |d         }}||j        d         k    r||j        d         k    rd}d}| j        d         |j        d         k    sJ |                                dk    rg| j        dd         |j        dd         k    r|                                 sJ t	          j        | j        d         |          |j        d         k    sJ |                                 | j        d         z  |j        \  }|j        dk    r|                                sJ |                                dk    r|j        dk    sJ t	          j        |          |j        d         k    sJ  d	| d	|j                     t	          j        ||          |j        d         k    sJ | d	| d	|j                     | j        dd         fz   }	|                     |	|
          }
d}|k     r$t	          j                  }t          |d          }|}||z  dk    sJ |}fd}|                                dk    r|                                dk    rt          |         | ||
||||||                     d          |                     d          |                    d          |                    d          |
                    d          |
                    d          |||d           nt          |         | ||
||||||                     d          |                     d          |                    d          |                    d          |
                    d          |
                    d          |                    d          |                    d          |                    d          |                    d          |||d           |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    N)r   r   r   r	   r   rC   r   , rD      c                 t    t          j        | d                   t          j        | d                   z  fS )NrR   rS   )rF   rG   )METAre   rf   s    r   rJ   z*w8a8_block_fp8_matmul_triton.<locals>.gridQ  s3    AtN344v{1d>FZ7[7[[]]rA      )rR   rS   rT   rU   )r"   rL   rH   rK   rF   rG   ndimrO   next_power_of_2r3   r   strider   )r`   ra   rc   rd   r   r   block_nblock_krg   C_shaperb   rR   rT   rS   rJ   re   rf   s                  @@r   w8a8_block_fp8_matmul_tritonr     s	   . #:!####%a=*Q- !'"+'QWR["8"872;!'"+%%%%	xxzzQwss|rx},,1B1B,,,{172;00BHRL@@@@			QWR[ A7DAq6Q;;1??,,;;;	xxzzQw!||||{1g&&"(1+555!7T7Tw7T7T"(7T7T555{1g&&"(1+555!7T7Tw7T7T"(7T7T555gcrclaT!G	G<00AL<-a00<,,L\!Q&&&&L^ ^ ^ ^ ^ ^ 
xxzzQ288::??)$/HHRLLHHRLLHHQKKHHQKKHHRLLHHRLL%%%)	
 	
 	
 	
 	
. 	t$HHRLLHHRLLHHQKKHHQKKHHRLLHHRLLIIbMMIIbMMIIaLLIIaLL%%%1	
 	
 	
 	
6 HrA   c                 B   t          ||          rt                      }|	 | j        }|                                 | j        d         z  }| j        d         }	|j        d         }
|	dz  dk    s	|
dz  dk    rt	          d|	 d|
 d          |                     ||	                                          }|                                                                }|                    |d                                          }|                                                                                                }|                                                                }|                                                                                                }|                    |||||d          }|dd         |
fz   }|                    |          S # t          $ r(}t                              d| d	           Y d}~nd}~ww xY wt          | |||||          S )
a  
    Dispatch to CUTLASS or Triton for block-wise FP8 matmul.

    Uses CUTLASS when:
    - Block size is [128, 128] (the only size CUTLASS supports)
    - Running on SM90+ (Hopper or newer)
    - The CUTLASS kernel is available
    - Output dtype is bfloat16 or float16 (CUTLASS requirement)
    - Tensor dimensions are compatible (divisible by 16)

    Otherwise falls back to Triton.
    NrC   r	   r   zCUTLASS requires K (z	) and N (z) divisible by 16zCUTLASS kernel failed: r   )r(   r   rL   rH   
ValueErrorview
contiguoustcutlass_scaled_mmr   r   r   r   )r`   ra   rc   rd   r   r   r'   original_shapere   rg   rf   A_2dB_col_majorAs_2dBs_kmrb   r   r   s                     r   w8a8_block_fp8_matmulr     s   * \22 0\)++-\ "#GGII,GBKGAJ r6Q;;!b&A++$%\A%\%\%\%\%\]]]vva||..00  llnn..00 21133		,,..0022 ))++		,,..0022 ,,T;ul\`aa("-4vvg& \ \ \##$Za$Z$Z$Z[[[[[[[[\ (1b"j,OOOs   F3G 
H
"HH
input_qweight_qinput_scaleweight_scalec                    | j         dk    r| j        nd| j        d         | j        d         f\  }}}|j        d         }	|                     d|          }
|                    |j        d         d          }|	|d         z  }||d         z  }t          j        ||z  |	ft          j        | j                  }t          |          D ]}||d         z  }||d         z   }t          |          D ]}||d         z  }||d         z   }|
dd||f         }|||||f         }|dd||dz   f         }|||f         }t          j        ||	                                t          j
        dt          j        | j                  ||          |z  }|dd||fxx         |z  cc<   |                    |||	          }|                    |          S )a  
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
       r   r	   rC   )r5   deviceN)r   r   	out_dtype)r   rL   r   r   r]   r2   r   r^   
_scaled_mmr   tensorr1   )r   r   r   r   r   r   
batch_sizeseq_len
hidden_dimout_featuresinput_reshapedinput_scale_reshapednum_weight_blocks_mnum_weight_blocks_noutputim_startm_endjn_startn_endinput_blockweight_blockcurr_input_scalecurr_weight_scaleblock_results                             r   w8a8_block_fp8_matmul_compiler     s4   ( 8?|q7H7HgmmqRYR_`aRbdkdqrsdtNu#J>!$L \\"j11N&++K,=a,@"EE&*Q-7$
15[*w.=U][b[ijjjF&'' 5 5jm#*Q-'*++ 	5 	5A*Q-'Gjm+E )GEM)9:K#GEM75=$@AL  4AAAq1q5yLA ,QT 2   NN$$!L%-WWW-*   ##  111gem#$$$4$$$$/	52 [[Wl;;F99\"""rA   c                   ~     e Zd Zdej        ddfdedededeeef         dz  f fdZd	ej	        d
ej	        fdZ
 xZS )	FP8LinearFNdynamicin_featuresr   biasr   c                 d   t                                          ||           || _        || _        t          j                            t	          j        |||                    | _        | j        8t          j        t	          j	        dt          j
                            | _        nz|| j        d         z   dz
  | j        d         z  }|| j        d         z   dz
  | j        d         z  }t          j        t	          j        ||t          j
                            | _        | j        dk    r7t          j        t	          j	        dt          j
                            | _        |r2t          j        t	          j        | j                            | _        d S |                     dd            d S )NrD         ?r	   r   staticr   )super__init__r   activation_schemer   nn	Parameteremptyweightr   r2   weight_scale_invactivation_scaler   r   register_parameter)
selfr   r   r   r5   r   r   scale_out_featuresscale_in_features	__class__s
            r   r   zFP8Linear.__init__  st    	l333 %!2h((\;V[)\)\)\]]?"$&Lc1W1W1W$X$XD!!".1C"Ca"GDO\]L^!^!,tq/A!AA!E$/Z[J\ \$&L.0AWWW% %D! !X--$&Lc1W1W1W$X$XD! 	2U[1B%C%CDDDIII##FD11111rA   inputr   c           	         | j                                         dk    r t          j        || j         | j                  S t          | j         t          j        j        j	                  r=| j         j
                                        }| j        j
                                        }n2| j                                         }| j                                        }t                      r#t          j                                        j        nd}t#          t          |t          j                  }|                    |j                  5  | j        dk    rt+          || j        d                   \  }}n| j        dk    rf| j                            t          j                  }||z                      t6          t8                                        t          j                  }nt=          d          t?          ||||| j        |j                   }d d d            n# 1 swxY w Y   |!                                 | j        
|| j        z   }|                    |j                   S )	Nr   r   r   r   r\   r3   zNot supportedr   rD   )"r   element_sizeFlinearr   
isinstancer   distributedr   DTensor_local_tensorr   r   r   acceleratorcurrent_acceleratortypegetattrr   r   r   rQ   r   r   r1   r2   clamp_FP8_MIN_FP8_MAXrN   NotImplementedErrorr   r5   synchronize)	r   r   r   	scale_invdevice_typetorch_accelerator_moduleqinputscaler   s	            r   forwardzFP8Linear.forward;  sE   ;##%%))8E4;	:::$+u'8'?'GHH ?2==?? 1?JJLL		//11 1<<>>	JhJjJjv%+??AAFFpvK'.uk5:'N'N$)00>>  )Y66$-eT_Q5G$H$HMFEE+x77 144U]CCE#em22xX2NNQQRWReffFF .o>>>.O!&                * %00222y$$)+995;9///s   C
HH H)__name__
__module____qualname__r   rN   intbooltupler   Tensorr   __classcell__r   s   @r   r   r     s        
 !-1# 2  2 2  2 	 2 #s(Od* 2  2  2  2  2  2D&0U\ &0el &0 &0 &0 &0 &0 &0 &0 &0rA   r   c                     | |z   dz
  |z  S )Nr    )r   r   s     r   	_ceil_divr
  d  s    EAI!rA   c                        e Zd Zej        f fd	Zdej        dej        dej        dej        fdZdej        dej        d	ej        dej        fd
Z xZ	S )	FP8Expertc                    t                                                       ddlm} || _        t          |d          r|j        n|j        | _        |j        | _	        t          |d          r|j
        n|j        | _        d| j        z  | j	        }}| j	        | j        }}t          j        t          j        | j        |||                    | _        t          j        t          j        | j        |||                    | _        | j        \  }	}
t'          ||	          }t'          ||
          }t          j        t          j        | j        ||t          j                            | _        t'          ||	          }t'          ||
          }t          j        t          j        | j        ||t          j                            | _        |                     dd            |                     dd            ||j                 | _        d S )Nr   )ACT2FNnum_local_expertsmoe_intermediate_sizerD   gate_up_bias	down_bias)r   r   activationsr  r   hasattrr  num_expertshidden_sizer   r  intermediate_sizeintermediate_dimr   r   r   r]   gate_up_proj	down_projr
  r2   gate_up_proj_scale_invdown_proj_scale_invr   
hidden_actact_fn)r   configr   r5   r  Wg_outWg_inWd_outWd_inbobi
gu_scale_o
gu_scale_i
dp_scale_o
dp_scale_ir   s                  r   r   zFP8Expert.__init__i  s   (((((($7>vGZ7[7[s633agas ,,3F<S,T,TrF((Z`Zr 	 D114?)>LT5Evu\a)b)b)bccek$2BFEY^&_&_&_``B vr**
ub))
&(lK(*jVVV'
 '
#
 vr**
ub))
#%<K(*jVVV$
 $
 
 	555T222 V./rA   hidden_statestop_k_indextop_k_weightsr   c                    t          j        |          }t          j                    5  t           j        j                            || j                  }|                    ddd          }t          j        |	                    d          d          
                                }d d d            n# 1 swxY w Y   |D ]*}|d         }|t          | j                  k    r$t          j        ||                   \  }}	||	         }
|                     |
| j        |         | j        |                                       dd          \  }}|                     |          |z  }|                     || j        |         | j        |                   }||	|d f         }||                    |j                  z  }|                    d|	|                    |j                             ,|S )N)num_classesr   r   r	   )rC   r   dimrC   )r   
zeros_likeno_gradr   r
   one_hotr  permutegreatersumnonzeror"   r  wherer   r  chunkr  r  r  r1   r5   
index_add_)r   r*  r+  r,  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stategateupcurrent_hidden_statesrouting_weightss                  r   r   zFP8Expert.forward  s,    $.}==]__ 	S 	S(-55ktO_5``K%--aA66K{8'D'DaHHPPRRJ	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S
 % 	n 	nJ#AJS!23333#(;{:/F#G#G Iy))4M{{t0<d>YZd>e eA2e D" %)KK$5$5$:!$(KK%t~j'A4C[\fCg% %! ,Iy$,FGO$9O<N<NOdOj<k<k$k!**1i9N9Q9QReRk9l9lmmmm""s   A>B22B69B6r   r   r   c           	      @   |                                 dk    rt          j        ||d           S t                      r#t          j                                        j        nd}t          t          |t          j	                  }|
                    |j
                  5  t          || j        d                   \  }}t          ||||| j        |j                  }d d d            n# 1 swxY w Y   |                                 |                    |j                  S )Nr   r   r   rD   )r   r   r   r   r   r   r   r   r   r   r   rQ   r   r   r5   r   r1   )	r   r   r   r   r   r   r   r   r   s	            r   r   zFP8Expert.linear  sF     1$$8E64000 KiJjJjv%+??AAFFpvK'.uk5:'N'N$)00>> 	 	 )%1C D D.$O!&  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 %00222995;9///s   >C&&C*-C*)
r   r   r  r   rN   r   r  r   r   r  r  s   @r   r  r  h  s        161D )0 )0 )0 )0 )0 )0\#|# \# |	#
 
# # # #@0EL 0%, 0RWR^ 0chco 0 0 0 0 0 0 0 0rA   r  Fmodules_to_not_convertc                 6   |j         r| S d}|                                 D ]\  }}t          ||          s|ri nddi}d}t          j        d          5  |                    d          rt          d
| j        |j        d|}nGt          |t          j                  r-t          d
|j        |j        |j        du|j        |j        d|}||                     ||           d}ddd           n# 1 swxY w Y   |st$                              d	           | S )a  
    A helper function to replace all `torch.nn.Linear` modules by `FP8Linear` modules.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`list[`str`]`, *optional*, defaults to `None`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for numerical stability reasons.
        quantization_config (`FbgemmFp8Config`):
            The quantization config object that contains the quantization parameters.
        pre_quantized (`book`, defaults to `False`):
            Whether the model is pre-quantized or not
    Fr5   NrI   z.experts)r  r   )r   r   r   r   r   TzYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.r	  )
dequantizenamed_modulesr   r   r   endswithr  r  weight_block_sizer   r   Linearr   r   r   r   r   set_submoduler   warning)	modelrG  quantization_configpre_quantizedhas_been_replacedmodule_namemodulemodule_kwargs
new_modules	            r   replace_with_fp8_linearrX    s   " % $2244 ) )V$[2HII 	+@'4
\&!! 	) 	)##J// &  <4G4Y ]j 

 FBI.. &  & 2!'!4D0&9&K2D  $ 
 %##K<<<$(!!	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	)$  
<	
 	
 	
 Ls   BC11C5	8C5	c                   L    e Zd ZdZd Zdej        deeej        f         fdZ	dS )Fp8Quantizez^
    A quantization operation that creates two tensors, weight and scale out of a weight.
    c                     || _         d S Nhf_quantizerr   r^  s     r   r   zFp8Quantize.__init__      (rA   
input_dictr   c                    t          |                                          d         \  }}|d         }d }| j        j        Zt	          | j        j        t
                    r | j        j                            d          }nt          | j        j        dd           }||j        d         |j        d         f}|\  }}|j        d         |j        d         }	}||z  dk    s	|	|z  dk    rt          d| d|	 d| d| d| 
          |j        d d         }
||z  }|	|z  }|j        }|
                    t          j                  } |j        g |
||||R  }|                                                    d	
          }t          j        |dk    |t          j        |                    }t$          |z  }t          j        |dk    |t          j        |                    }|                    d                              d          }||z  }t          j        |t*          t$                    
                    t,                    }|                    |          }d|z  
                    t          j                  }|                    d          r |                    dd          d         dz   }n|dz   }||||iS )Nr	   rL  r   rC   Matrix dimensions (r   $) must be divisible by block sizes (z). for )rC   r/  re  r   r   r   .r   z.weight_scale_inv
_scale_inv)r  itemsr^  rQ  r   dictgetr   rL   r   r1   r   r2   reshaper4   amaxr8  	ones_liker   	unsqueezer   r   
_FP8_DTYPErK  rsplit)r   ra  kwargstarget_keysvaluer   block_mr   rowscolsleading_shape
rows_tiles
cols_tilesr   
value_fp32reshapedmax_abssafe_max_absscalesscales_broadcastscaled	quantized
inv_scales	scale_keys                           r   convertzFp8Quantize.convert  s   ":#3#3#5#566q9Ua 
0<$+?FF g!.BFFGZ[[

$T%6%JL_aeff
+b/5;r?;J%[_ek"od '>Q$.A"5"5 Ad  A  Ad  A  AX_  A  Acj  A  As~  A  A  
 CRC(W_
W_
XXem,,
 &:%_}_j_'_:_W^___ ,,..%%(%33{7Q;9Q9QRR L(Wq[&%/&2I2IJJ "++B//99"==,,KH(CCCFFzRR	%%n55	Fl&&u}55
)) 	3#**322158KKII#l2I z
 	
rA   N)
r   r   r  __doc__r   r   r  ri  strr  r	  rA   r   rZ  rZ    s]         ) ) )?
%, ?
T#u|BS=T ?
 ?
 ?
 ?
 ?
 ?
rA   rZ  c            	       j    e Zd ZdZd Z	 ddeeej        f         dedz  deeej        f         fdZ	dS )	Fp8DequantizeziInverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor.c                     || _         d S r\  r]  r_  s     r   r   zFp8Dequantize.__init__P  r`  rA   Nra  full_layer_namer   c                    t          |          dk     r
||d         iS |d         d         }|d         d         }|j        dd          \  }}| j        j        j        }||j        d         |j        d         f}|\  }	}
||	z  dk    s	||
z  dk    rt          d| d| d	|	 d|
 d
	          |                    |j                  }|                    d||	z  |	||
z  |
          }|                    d||	z  ||
z            }|	                    d          	                    d          }||z  }||                    |j                  iS )Nr   zweight$r	   r   r   rC   rc  r   rd  z).)
r"   rL   r^  rQ  rL  r   r1   r5   rk  rn  )r   ra  r  rq  r  r~  ru  rv  r   rt  r   r{  expanded_scalesdequantizeds                 r   r  zFp8Dequantize.convertS  s    z??Q#Z	%:;;y)!,	./2_RSS)
d&:L
#/"-yr/BCJ%'>Q$.A"5"5ndnndnnX_nncjnnn   LL..	$$R'47?T[\\ ..TW_dgoNN)33B77AA!DD0 [00AA
 	
rA   r\  )
r   r   r  r  r   ri  r  r   r  r  r	  rA   r   r  r  M  s        ss) ) ) '+ 
  
el*+ 
 t 

 
c5<	  
  
  
  
  
  
rA   r  )r   )NNF)9core_model_loadingr   quantizers.quantizers_utilsr   utilsr   r   r   r   r   torch.nnr   rF   triton.languagelanguager-   r
   r   
get_loggerr   r   r   r   listr  r5   r  r(   rN   ro  finfor\   r   r3   r   jit	constexprr@   r  r  rQ   r   r   r2   r   r   compiler   rM  r   r
  Moduler  r  rX  rZ  r  r	  rA   r   <module>r     s   / . . . . . ? ? ? ? ? ? e e e e e e e e e e e e  )LLLMMM      (((((( 
	H	%	%  B B B$$s)d"2 $%+ $RV $ $ $ $N  
5;z""&5;z""& bl    
 
 
3 
u|U\?Y9Z 
 
 
 
 Q%4 ,5Q%6 ,7Q%8 ,9Q%: ,;Q% Q% Q% Q%h E%, ,-E%. ,/E%0 ,1E%2 ,3E% E% E% E%\ !&r r|r|r 	r 		r
 S	r +r \r r r rv !&HP HP|HP|HP 	HP 		HP
 S	HP +HP \HP HP HP HPX  *. %># >#\>#l># ># ,	>#
 c3h$&># +># \># ># ># >#BI0 I0 I0 I0 I0	 I0 I0 I0X  c0 c0 c0 c0 c0	 c0 c0 c0N ej2 2#'9t#32 2 2 2jG
 G
 G
 G
 G
- G
 G
 G
T&
 &
 &
 &
 &
M &
 &
 &
 &
 &
rA   