
    bi
%                        d dl mZ d dlmZ ddlmZmZ ddlmZ ddl	m
Z
 ddlmZ  e            rd d	lZerdd
lmZ  ej        e          Z G d de
          Zd	S )    )annotations)TYPE_CHECKING   )is_torch_availablelogging)
SinqConfig   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                       e Zd ZU dZdZded<   d  fdZd!d	Zed!d
            Z	d Z
d"dZd#dZd$dZd%dZd Zd Z	 d&d'dZd(dZ xZS ))SinqHfQuantizera  
    HF v5 quantizer for SINQ.

    Modes:
      - method="sinq" (default):
          * weight-only SINQ
          * param-level ConversionOps (`SinqQuantize`) during load for pure language models
            (each Linear.weight is turned into a SINQLinear module)
          * module-level quantization after load for multimodal models
      - method="asinq":
          * A-SINQ (activation-aware) SINQ quantization
    Tbool requires_parameters_quantizationquantization_configr   c                X     t                      j        |fi | d | _        d| _        d S )NF)super__init___normalized_device_str_do_param_level_sinq)selfr   kwargs	__class__s      a/root/projects/butler/venv/lib/python3.11/site-packages/transformers/quantizers/quantizer_sinq.pyr   zSinqHfQuantizer.__init__1   s9    ,7777726#*/!!!    returnc                    dS NT r   s    r   is_serializablezSinqHfQuantizer.is_serializable7   s    tr   c                    dS r   r   r    s    r   is_trainablezSinqHfQuantizer.is_trainable:   s    tr   c                    |at           j                                        r!dt           j                                        i}nddi}t                              d| d           |S )N cpuz:The device_map was not initialized. Setting device_map to zJ. If you want to use the model for inference, please set device_map='auto')torchcudais_availablecurrent_deviceloggerinfo)r   
device_maps     r   update_device_mapz!SinqHfQuantizer.update_device_map>   sz    z&&(( ) %*";";"="=>

 %[
KK[)3[ [ [  
 r   dtypetorch.dtypec                0    |t           j        }|| _        |S N)r'   bfloat16r/   )r   r/   s     r   update_dtypezSinqHfQuantizer.update_dtypeK   s    =NE
r   Nonec                    ddl m}  |            st          d          t          j                                        st                              d           |                    d          }t          |t                    rTt          |                                          }t          |          dk    r t          dt          |           d          | j        j        d	k    r| j        st'          d
          d S d S )Nr   )is_sinq_availablezMThe 'sinq' package is not installed. Please install it with: pip install sinqzNo CUDA device is available. Quantization and inference will run on the CPU. Please note that this will significantly slow down inference speed and increase quantization time.r-   r	   zkSinqHfQuantizer: multi-GPU device_map detected, but SINQ currently supports only a single CUDA device. Got z. Please use device_map=None.asinqzYou are using `method='asinq'` in the quantization config. Right now the calibrated version of SINQ is not supported in Hugging Face, please refer and use the official SINQ repository `to quantize a model with this method. )utilsr7   ImportErrorr'   r(   r)   r+   warningget
isinstancedictsetvalueslenRuntimeErrorsortedr   methodpre_quantized
ValueError)r   argsr   r7   r-   device_map_valuess         r   validate_environmentz$SinqHfQuantizer.validate_environmentQ   s0   ------  "" 	omnnnz&&(( 	NN B   ZZ--
j$'' 	 #J$5$5$7$7 8 8$%%))"\#)*;#<#<\ \ \  
 #*g55d>P5:   6555r   cfgr>   c                    ddl m} |j        } |t          |j                  |j        t          |j                  ndddddt          |j                  |          S )zI
        Build the dict that SINQLinear expects as quant_config.
        r   )sinq_base_quant_configNFr	   )nbits
group_size
quant_zeroquant_scaleview_as_floataxistiling_moderD   )sinq.sinqlinear_hfrL   rD   intrM   rN   strrS   )r   rJ   sinq_base_quant_config_fnrD   s       r   _build_sinq_quant_dictz&SinqHfQuantizer._build_sinq_quant_dictm   sx     	[ZZZZZ((ci...1n.Hs3>***dCO,,	
 	
 	
 		
r   modelr   
param_namerV   c                    ddl m} | j        rdS | j        j        dk    rdS | j        sdS t          ||          \  }}|dk    rdS t          ||          }t          |dd          }|o| }	|	S )a-  
        Called per-parameter to decide whether to run `SinqQuantize` on it.

        - If `self.pre_quantized`, we do *not* quantize again (handled by SinqDeserialize instead).
        - For method="asinq": return False (ASINQ is not supported in Hugging Face).
        - For method="sinq": True only for SINQLinear.weight not in modules_to_not_convert.

        Note: After _process_model_before_weight_loading(), the modules are already SINQLinear,
        not nn.Linear. We check for SINQLinear modules that are not yet quantized (ready=False).
        r   )
SINQLinearFr8   weightreadyT)	rT   r\   rE   r   rD   r   r   r=   getattr)
r   rY   rZ   r   r\   moduletensor_nameis_sinqis_readyresults
             r   param_needs_quantizationz(SinqHfQuantizer.param_needs_quantization   s     	211111 	5#*g555 ( 	525*EE(""5 VZ0067D11)\r   c                $    ddl m}  ||           S )z
        Return the ConversionOps used for param-level quantization (Sinq).
        The actual SINQLinear construction is in integrations/sinq.py.
        r   )SinqQuantize)integrations.sinqrg   )r   rg   s     r   get_quantize_opsz SinqHfQuantizer.get_quantize_ops   s&    
 	544444|D!!!r   c                d    ddl m} | j        r"ddlm}  |g ddg ||           g          gS g S )a4  
        If `pre_quantized=True`, interpret a checkpoint produced by SINQLinear.state_dict:

            <prefix>.W_q
            <prefix>.bias
            <prefix>.meta

        via a WeightConverter + SinqDeserialize so that we reconstruct a SINQLinear
        module instead of a plain nn.Linear.
        r   )WeightConverter)SinqDeserialize)z.W_qz.metaz.biasz.weight)source_patternstarget_patterns
operations)core_model_loadingrk   rE   rh   rl   )r   rk   rl   s      r   get_weight_conversionsz&SinqHfQuantizer.get_weight_conversions   s     	988888 	;;;;;;  % % %
 &/K / 5 56  
 
 	r   Nkeep_in_fp32_moduleslist[str] | Nonec                D   ddl m} |                     || j        j        pg |          | _        | j        j        dk    o| j         | _        | j        rdn|                     | j                  }t          |t                    rZt          t          |                                          d          }t          |t                    rd| }n2t          |          }n"t           j                                        rdnd} ||| j        || j        || j        	          }dS )
a  
        Called on meta-initialized model, before loading any weights.

        For SINQ, we replace nn.Linear modules with empty SINQLinear modules here.
        The actual quantization happens later in SinqQuantize.convert() when weights are loaded.
        r   )replace_with_sinq_linearsinqNr   zcuda:zcuda:0r&   )modules_to_not_convertquant_configcompute_dtypedevicerE   )rh   ru   get_modules_to_not_convertr   rw   rD   rE   r   rX   r=   r>   nextiterr@   rU   rV   r'   r(   r)   r/   )	r   rY   r-   rr   r   ru   sinq_quant_dictfirst_device
device_strs	            r   $_process_model_before_weight_loadingz4SinqHfQuantizer._process_model_before_weight_loading   s?    	A@@@@@&*&E&ED,CIrL`'
 '
#
 %)$<$Cv$M$hVZVhRh!"&"4o$$$:U:UVZVn:o:o j$'' 	JZ%6%6%8%8 9 91==L,,, /3\33

 ..

%*Z%<%<%>%>IEJ((#'#>(*,
 
 
r   c                &    ddl m}  |             |S )aq  
        Called after *all* weights have been loaded.

        For SINQ:
        1. Move non-SINQLinear modules to GPU (embeddings, norms, lm_head, etc.)
           - SINQLinear modules already have GemLite buffers on GPU
           - We skip moving SINQLinear's W_q/meta to avoid memory duplication
        2. Patch HF save/load methods for SINQ serialization
        r   )patch_hf_pretrained_io)
sinq.hf_ior   )r   rY   r   r   s       r   #_process_model_after_weight_loadingz3SinqHfQuantizer._process_model_after_weight_loading   s,     	655555 	   r   )r   r   )r   r   )r/   r0   r   r0   )r   r5   )rJ   r   r   r>   )rY   r   rZ   rV   r   r   r2   )rY   r   rr   rs   )rY   r   )__name__
__module____qualname____doc__r   __annotations__r   r!   propertyr#   r.   r4   rI   rX   re   ri   rq   r   r   __classcell__)r   s   @r   r   r   !   sC          .2$11110 0 0 0 0 0       X        8
 
 
 
$       D" " "  B 26	)
 )
 )
 )
 )
V       r   r   )
__future__r   typingr   r9   r   r   utils.quantization_configr   baser
   quantizers_utilsr   r'   modeling_utilsr   
get_loggerr   r+   r   r   r   r   <module>r      s    # " " " " "             / / / / / / / / 2 2 2 2 2 2       2 2 2 2 2 2  LLL 1000000		H	%	%d d d d dk d d d d dr   