
    bi.                       d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 	 d dlm
Z
 n# e$ r	 d dlm
Z
 Y nw xY wd dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ  ej        e          Z G d de          ZdS )    )annotationsN)Path)Any)Self)	save_file)	Tokenizer)nn)PreTrainedTokenizerFast)InputModule)get_device_namec                       e Zd Z	 	 d;d< fd
Zd=dZd>dZed?d            Zd?dZddd@dZ	e
	 	 	 	 	 dAdBd&            Ze
	 	 	 	 	 	 	 	 dCdDd8            Ze
dEd:            Z xZS )FStaticEmbeddingN	tokenizer#Tokenizer | PreTrainedTokenizerFastembedding_weights np.ndarray | torch.Tensor | Noneembedding_dim
int | NonereturnNonec                   t                                                       t          |t                    r|j        }n$t          |t
                    st          d          |Ut          |t          j                  rt          j
        |          }t          j                            |d          | _        n>|-t          j        |                                |          | _        nt          d          | j        j        | _        | j        j        | _        || _        | j                                         |                    dd          | _        dS )a4	  
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.

        .. tip::

            Due to the extremely efficient nature of this module architecture, the overhead for moving inputs to the
            GPU can be larger than the actual computation time. Therefore, consider using a CPU device for inference
            and training.

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode(["What are Pandas?", "The giant panda, also known as the panda bear or simply the panda, is a bear native to south central China."])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.8093]]) (If you use potion-base-8M)
            # tensor([[0.6234]]) (If you use the distillation method)
            # tensor([[-0.0693]]) (For example, if you use randomized embeddings)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        zThe tokenizer must be fast (i.e. Rust-backed) to use this class. Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer.NF)freezez?Either `embedding_weights` or `embedding_dim` must be provided.
base_model)super__init__
isinstancer
   
_tokenizerr   
ValueErrornpndarraytorch
from_numpyr	   EmbeddingBagfrom_pretrained	embeddingget_vocab_sizenum_embeddingsr   r   
no_paddinggetr   )selfr   r   r   kwargs	__class__s        g/root/projects/butler/venv/lib/python3.11/site-packages/sentence_transformers/models/StaticEmbedding.pyr   zStaticEmbedding.__init__   s9   f 	i!899 	!,IIIy11 	^  
 (+RZ88 H$)$45F$G$G!_<<=NW\<]]DNN&_Y-E-E-G-GWWDNN^___"n;!^9$-!!### !**\488    texts	list[str]dict[str, torch.Tensor]c           	     (   | j                             |d          }d |D             }t          j        t	          j        dgd |d d         D             z                       }t          j        d |D             t          j                  }||d	S )
NF)add_special_tokensc                    g | ]	}|j         
S  )ids).0encodings     r-   
<listcomp>z,StaticEmbedding.tokenize.<locals>.<listcomp>o   s    @@@(@@@r.   r   c                ,    g | ]}t          |          S r5   )len)r7   	token_idss     r-   r9   z,StaticEmbedding.tokenize.<locals>.<listcomp>q   s    3g3g3gyC	NN3g3g3gr.   c                    g | ]	}|D ]}|
S r5   r5   )r7   r<   token_ids      r-   r9   z,StaticEmbedding.tokenize.<locals>.<listcomp>r   s&    !d!d!dyZc!d!dh(!d!d!d!dr.   )dtype)	input_idsoffsets)r   encode_batchr!   r"   r   cumsumtensorlong)r*   r/   r+   	encodingsencodings_idsrB   rA   s          r-   tokenizezStaticEmbedding.tokenizem   s    N//%/PP	@@i@@@"29aS3g3gTabecebeTf3g3g3g-g#h#hiiL!d!dM!d!d!dlqlvwww	&7;;;r.   featuresc                P    |                      |d         |d                   |d<   |S )NrA   rB   sentence_embedding)r%   )r*   rJ   r+   s      r-   forwardzStaticEmbedding.forwardu   s+    )-8MxXaOb)c)c%&r.   intc                    t           j        S N)mathinfr*   s    r-   max_seq_lengthzStaticEmbedding.max_seq_lengthy   s	    xr.   c                    | j         S rP   )r   rS   s    r-    get_sentence_embedding_dimensionz0StaticEmbedding.get_sentence_embedding_dimension}   s    !!r.   T)safe_serializationoutput_pathstrrW   boolc                  |rAt          |                                 t          j                            |d                     nEt          j        |                                 t          j                            |d                     | j                            t          t          |          dz                       d S )Nzmodel.safetensorszpytorch_model.bintokenizer.json)
save_safetensors_file
state_dictospathjoinr!   saver   rY   r   )r*   rX   rW   argsr+   s        r-   rb   zStaticEmbedding.save   s     	Z!$//"3"3RW\\+Ob5c5cddddJt(("',,{DW*X*XYYYC[ 1 14D DEEFFFFFr.    Fmodel_name_or_path	subfoldertokenbool | str | Nonecache_folder
str | Nonerevisionlocal_files_onlyr   c                    |||||d} | j         |fddi|}	t          j        |	          }
 | j        dd|i|}	 |d         }n# t          $ r |d         }Y nw xY wt          |
|          S )	N)rf   rg   ri   rk   rl   filenamer\   re   zembedding.weight
embeddings)r   r5   )load_file_pathr   	from_fileload_torch_weightsKeyErrorr   )clsre   rf   rg   ri   rk   rl   r+   
hub_kwargstokenizer_pathr   weightss               r-   loadzStaticEmbedding.load   s     #(  0
 

 ,+,>hhIYh]ghh'77	(#(]]<N]R\]]	,01GG 	, 	, 	,l+GGG	, yGDDDDs   A AA   -C6?\[unused\d+\]float32
model_name
vocabularylist[str] | Nonedevicepca_dims
apply_zipfsif_coefficientfloat | Nonetoken_remove_patternquantize_touse_subwordr+   r   c
           	        	 ddl m} n# t          $ r t          d          w xY wt          j        |          }t          |j                                                  dhz
  |||||	|||d|
}
t          |
                                          z
  x}rdt          	                    dd
                    t          t          |                     d           fd	|
                                D             }
t                      } ||fi |
}t          |j        t"          j                  r,t'          j        |j                                                  }n|j        j        }|j        } | |||
          S )ak  
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
                the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            sif_coefficient (float | None, optional): The coefficient for SIF weighting. Defaults to 1e-4.
            token_remove_pattern (str | None, optional): A regex pattern to remove tokens from the vocabulary.
                Defaults to r"\[unused\d+\]".
            quantize_to (str): The data type to quantize the weights to. Defaults to 'float32'.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        r   )distillz\To use this method, please install the `model2vec` package: `pip install model2vec[distill]`r}   )r~   r   r   r   r   r   r   r   z1Your version of `model2vec` does not support the z, zh arguments for the `distill` method. Consider updating `model2vec` to take advantage of these arguments.c                $    i | ]\  }}|v 	||S r5   r5   )r7   keyvaluedistill_kwargss      r-   
<dictcomp>z5StaticEmbedding.from_distillation.<locals>.<dictcomp>   s)    [[[ZS%SNEZEZc5EZEZEZr.   r   r   )model2vec.distillr   ImportErrorinspect	signatureset
parameterskeysloggerwarningra   maprepritemsr   r   r%   r   r    r!   r"   
contiguousweightr   )rt   r}   r~   r   r   r   r   r   r   r   r+   r   distill_signature	leftoversstatic_modelr   r   r   s                    @r-   from_distillationz!StaticEmbedding.from_distillation   s   L	1111111 	 	 	n  	
 $-g66.9>>@@AA\NR$ $&&.$8

 

 

 FKKMM**^;;9 	\NNVDIIcRVXaNbNbDcDc V V V   \[[[6<<>>[[[F ""wz44V44l,bj99 	> % 01G H H S S U U , 6 =+5	s90AjYYYYs   
 $model_id_or_pathc                N   	 ddl m} n# t          $ r t          d          w xY w|                    |          }t	          |j        t          j                  r,t          j	        |j                  
                                }n|j        j        }|j        } | |||          S )aH  
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                 the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        r   )StaticModelzSTo use this method, please install the `model2vec` package: `pip install model2vec`r   )	model2vecr   r   r$   r   r%   r   r    r!   r"   r   r   r   )rt   r   r   r   r   r   s         r-   from_model2veczStaticEmbedding.from_model2vec   s    "	u------- 	u 	u 	usttt	u #223CDDl,bj99 	> % 01G H H S S U U , 6 =+5	s90AN^____s   	 #)NN)r   r   r   r   r   r   r   r   )r/   r0   r   r1   )rJ   r1   r   r1   )r   rN   )rX   rY   rW   rZ   r   r   )rd   NNNF)re   rY   rf   rY   rg   rh   ri   rj   rk   rj   rl   rZ   r   r   )NNry   Trz   r{   r|   T)r}   rY   r~   r   r   rj   r   r   r   rZ   r   r   r   rj   r   rY   r   rZ   r+   r   r   r   )r   rY   r   r   )__name__
__module____qualname__r   rI   rM   propertyrT   rV   rb   classmethodrx   r   r   __classcell__)r,   s   @r-   r   r      s        ?C$(	N9 N9 N9 N9 N9 N9 N9`< < < <       X" " " " HL G G G G G G  #'#'#!&E E E E [E8  (,!"(,+;$ HZ HZ HZ HZ [HZT ` ` ` [` ` ` ` `r.   r   ) 
__future__r   r   loggingrQ   r_   pathlibr   typingr   r   r   typing_extensionsnumpyr   r!   safetensors.torchr   r]   
tokenizersr   r	   transformersr
   (sentence_transformers.models.InputModuler   sentence_transformers.utilr   	getLoggerr   r   r   r5   r.   r-   <module>r      s   " " " " " "    				            ' ' ' '&&&&&&&&'      @ @ @ @ @ @                   0 0 0 0 0 0 @ @ @ @ @ @ 6 6 6 6 6 6		8	$	$p` p` p` p` p`k p` p` p` p` p`s   + 99