
    biX                        d dl mZ d dlZd dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
  ej        e          Z G d d	e          ZdS )
    )annotationsN)Literal)Tensor)InputModule   )WhitespaceTokenizerc                  ~     e Zd ZU dZdZded<   g dZded<   i dd	fd& fdZd'dZd(dZ	d Z
	 d)d*dZd	d d+d%Z xZS ),BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    Fboolsave_in_root)vocabword_weightsunknown_word_weightcumulative_term_frequency	list[str]config_keysr   Tr   r   dict[str, float]r   floatr   c                t   t                                                       t          t                              |                    }|| _        || _        || _        || _        g | _	        d}|D ]a}|}||v r	||         }n6|
                                |v r||
                                         }n|dz  }| j	                            |           bt                              | dt          |           d|            t          |t!                      d          | _        t          |          | _        d S )Nr   r   z out of z0 words without a weighting value. Set weight to F)
stop_wordsdo_lower_case)super__init__listdictfromkeysr   r   r   r   weightslowerappendloggerinfolenr   set	tokenizersentence_embedding_dimension)	selfr   r   r   r   num_unknown_wordswordweight	__class__s	           [/root/projects/butler/venv/lib/python3.11/site-packages/sentence_transformers/models/BoW.pyr   zBoW.__init__   s?    	T]]5))**
(#6 )B&  	( 	(D(F|##%d+--%djjll3!Q&!L'''' {{#e**{{fy{{	
 	
 	
 -UsuuTYZZZ,/JJ)))    featuresdict[str, Tensor]c                    |S N )r&   r-   s     r+   forwardzBoW.forward;   s    r,   textsreturn	list[int]c                N      fd|D             }                      |          S )Nc                6    g | ]} j         j        |fi S r1   )r$   tokenize).0textkwargsr&   s     r+   
<listcomp>z BoW.tokenize.<locals>.<listcomp>@   s1    OOO,T^,T<<V<<OOOr,   )get_sentence_features)r&   r3   r;   	tokenizeds   ` ` r+   r8   zBoW.tokenize?   s4    OOOOOOOO	)))444r,   c                    | j         S r0   )r%   )r&   s    r+    get_sentence_embedding_dimensionz$BoW.get_sentence_embedding_dimensionC   s    00r,   r   tokenized_textslist[list[int]]pad_seq_lengthint1dict[Literal['sentence_embedding'], torch.Tensor]c                :   g }|D ]}t          j        |                                 t           j                  }|D ]5}| j        r||xx         | j        |         z  cc<   %| j        |         ||<   6|                    |           dt          j        |          iS )N)dtypesentence_embedding)torchzerosr@   float32r   r   r   stack)r&   rA   rC   vectorstokensvectortokens          r+   r=   zBoW.get_sentence_featuresF   s     % 	# 	#F[!F!F!H!HPUP]^^^F 8 81 85MMMT\%%88MMMM$(L$7F5MMNN6""""$ek'&:&:;;r,   )safe_serializationoutput_pathstrrQ   Nonec               0    |                      |           d S r0   )save_config)r&   rR   rQ   argsr;   s        r+   savezBoW.saveV   s    %%%%%r,   )r   r   r   r   r   r   r   r   )r-   r.   )r3   r   r4   r5   )r   )rA   rB   rC   rD   r4   rE   )rR   rS   rQ   r   r4   rT   )__name__
__module____qualname____doc__r   __annotations__r   r   r2   r8   r@   r=   rX   __classcell__)r*   s   @r+   r
   r
      s          
 LjjjKjjjj
 *,%&*. 7  7  7  7  7  7  7D   5 5 5 51 1 1 GH< < < < <  HL & & & & & & & & & &r,   r
   )
__future__r   loggingtypingr   rI   r   (sentence_transformers.models.InputModuler   r$   r   	getLoggerrY   r    r
   r1   r,   r+   <module>rd      s    " " " " " "               @ @ @ @ @ @ * * * * * *		8	$	$G& G& G& G& G&+ G& G& G& G& G&r,   