
    biC              
         d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	Z
d dlZd dlmZ d dlmZ ddlmZ dd	lmZ  ej        e          Zerd d
lmZ ddddddedddf
d8d&Zddddefd9d)Zd:d+Zddd,efd;d/Z	 	 	 	 d<d=d7ZdS )>    )annotationsN)Callable)TYPE_CHECKING)Tensor)tqdm   )cos_sim)normalize_embeddings)SentenceTransformerF    i  i i  d   modelr   	sentences	list[str]show_progress_barbool
batch_sizeintquery_chunk_sizecorpus_chunk_size	max_pairstop_kscore_function"Callable[[Tensor, Tensor], Tensor]truncate_dim
int | Noneprompt_name
str | Nonepromptreturnlist[list[float | int]]c           	     d    |                      |||d|	|
|          }t          ||||||          S )a@	  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    Args:
        model (SentenceTransformer): SentenceTransformer model for embedding computation
        sentences (List[str]): A list of strings (texts or sentences)
        show_progress_bar (bool, optional): Plotting of a progress bar. Defaults to False.
        batch_size (int, optional): Number of texts that are encoded simultaneously by the model. Defaults to 32.
        query_chunk_size (int, optional): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time). Defaults to 5000.
        corpus_chunk_size (int, optional): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time). Defaults to 100000.
        max_pairs (int, optional): Maximal number of text pairs returned. Defaults to 500000.
        top_k (int, optional): For each sentence, we retrieve up to top_k other sentences. Defaults to 100.
        score_function (Callable[[Tensor, Tensor], Tensor], optional): Function for computing scores. By default, cosine similarity. Defaults to cos_sim.
        truncate_dim (int, optional): The dimension to truncate sentence embeddings to. If None, uses the model's ones. Defaults to None.
        prompt_name (Optional[str], optional): The name of a predefined prompt to use when encoding the sentence.
            It must match a key in the model `prompts` dictionary, which can be set during model initialization
            or loaded from the model configuration.

            Ignored if `prompt` is provided. Defaults to None.

        prompt (Optional[str], optional): A raw prompt string to prepend directly to the input sentence during encoding.

            For instance, `prompt="query: "` transforms the sentence "What is the capital of France?" into:
            "query: What is the capital of France?". Use this to override the prompt logic entirely and supply your own prefix.
            This takes precedence over `prompt_name`. Defaults to None.

    Returns:
        List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    T)r   r   convert_to_tensorr   r   r   )r   r   r   r   r   )encodeparaphrase_mining_embeddings)r   r   r   r   r   r   r   r   r   r   r   r   
embeddingss                _/root/projects/butler/venv/lib/python3.11/site-packages/sentence_transformers/util/retrieval.pyparaphrase_miningr(      s]    \ +!   J ()+%       r&   r   c                   |dz  }t          j                    }d}d}t          dt          |           |          D ]x}	t          dt          |           |          D ]U}
 || |
|
|z            | |	|	|z                      }t	          j        |t          |t          |d                             ddd          \  }}|                                                                }|                                                                }t          t          |                    D ]}t          ||                   D ]r\  }}|
|z   }|	|z   }||k    r]||         |         |k    rK|
                    ||         |         ||f           |dz  }||k    r|                                }|d         }sWzt                      }g }|                                s{|                                \  }}}t          ||g          \  }}||k    r5||f|vr/|                    ||f           |                    |||g           |                                {t          |d d          }|S )	a  
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    Args:
        embeddings (Tensor): A tensor with the embeddings
        query_chunk_size (int): Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
        corpus_chunk_size (int): Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
        max_pairs (int): Maximal number of text pairs returned.
        top_k (int): For each sentence, we retrieve up to top_k other sentences
        score_function (Callable[[Tensor, Tensor], Tensor]): Function for computing scores. By default, cosine similarity.

    Returns:
        List[List[Union[float, int]]]: Returns a list of triplets with the format [score, id1, id2]
    r   r   TFdimlargestsortedc                    | d         S )Nr    xs    r'   <lambda>z.paraphrase_mining_embeddings.<locals>.<lambda>   s
    !A$ r)   keyreverse)queuePriorityQueuerangelentorchtopkmincputolist	enumerateputgetsetemptyr/   addappend)r&   r   r   r   r   r   pairs	min_score	num_addedcorpus_start_idxquery_start_idxscoresscores_top_k_valuesscores_top_k_idx	query_itr	top_k_idx
corpus_itrijentryadded_pairs
pairs_listscoresorted_isorted_js                            r'   r%   r%   Y   s   0 
QJE !!EII!!S__6GHH 1 1$QJ9IJJ 	1 	1O#^?_?O-OOP+.>AR.RRS F
 5:JE3vay>>224PU5 5 51!1 #6"9"9";";"B"B"D"D/3355<<>>"3v;;// 1 1	-67G	7R-S-S 
1 
1)Iz')3A(:5AAvv"5i"@"Ki"W"W		#6y#A)#LaQR"STTT!Q	$	11$)IIKKE(-aI
11	14 %%KJkkmm ;iikkq!#QF^^(xXx$8$K$KOOXx0111uh9::: kkmm ; 
EEEJr)   "list[list[dict[str, int | float]]]c                     t          | i |S )z8This function is deprecated. Use semantic_search instead)semantic_search)argskwargss     r'   information_retrievalr`      s    D+F+++r)   
   query_embeddingscorpus_embeddingsc                   t          | t          j        t          j        f          rt	          j        |           } n)t          | t                    rt	          j        |           } t          | j	                  dk    r| 
                    d          } t          |t          j        t          j        f          rt	          j        |          }n)t          |t                    rt	          j        |          }|j        | j        k    r|                     |j                  } d t          t          |                     D             }t          dt          |           |          D ]"}t          ||z   t          |                     }| j        r3t	          j        ||| j                  }	|                     d|	          }
n
| ||         }
t          dt          |          |          D ]}t          ||z   t          |                    }|j        r3t	          j        |||j                  }	|                    d|	          }n
|||         } ||
|          }t	          j        |t          |t          |d                             ddd          \  }}|                                                                }|                                                                }t          t          |                    D ]}t+          ||         ||                   D ]c\  }}||z   }||z   }t          ||                   |k     rt-          j        ||         ||f           Ft-          j        ||         ||f           d$t          t          |                    D ]b}t          t          ||                             D ]!}||         |         \  }}||d||         |<   "t3          ||         d	 d
          ||<   c|S )a3  
    This function performs by default a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

    Args:
        query_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the query embeddings. Can be a sparse tensor.
        corpus_embeddings (:class:`~torch.Tensor`): A 2 dimensional tensor with the corpus embeddings. Can be a sparse tensor.
        query_chunk_size (int, optional): Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory. Defaults to 100.
        corpus_chunk_size (int, optional): Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory. Defaults to 500000.
        top_k (int, optional): Retrieve top k matching entries. Defaults to 10.
        score_function (Callable[[:class:`~torch.Tensor`, :class:`~torch.Tensor`], :class:`~torch.Tensor`], optional): Function for computing scores. By default, cosine similarity.

    Returns:
        List[List[Dict[str, Union[int, float]]]]: A list with one entry for each query. Each entry is a list of dictionaries with the keys 'corpus_id' and 'score', sorted by decreasing cosine similarity scores.
    r   r   c                    g | ]}g S r1   r1   ).0_s     r'   
<listcomp>z#semantic_search.<locals>.<listcomp>   s    DDD!2DDDr)   deviceTFr,   )	corpus_idrX   c                    | d         S )NrX   r1   r2   s    r'   r4   z!semantic_search.<locals>.<lambda>   s    \]^e\f r)   r5   )
isinstancenpndarraygenericr<   
from_numpyliststackr;   shape	unsqueezerj   tor:   r>   	is_sparsearangeindex_selectr=   r?   r@   zipheapqheappushheappushpopr/   )rb   rc   r   r   r   r   queries_result_listrL   query_end_idxindicesquery_chunkrK   corpus_end_idxcorpus_chunk
cos_scorescos_scores_top_k_valuescos_scores_top_k_idxrP   sub_corpus_idrX   rk   query_iddoc_itrs                          r'   r]   r]      se   0 "RZ$<== 9 +,<==	$d	+	+ 9 ;'788
!""a''+55a88#bj"*%=>> ;!,->??	%t	,	, ;!K(9:: #3#:::+../@/GHHDDuS1A-B-B'C'CDDD C(8$9$9;KLL $] $]O.>>DT@U@UVV% 	Jl?MJZJabbbG*777CCKK*?=+HIK !&a->)?)?AR S S 	] 	] !14E!EsK\G]G]^^N * R,'7PaPhiii0==aII01A.1PQ (\BBJ =BJCs:a='9'9::4X]= = =9#%9 '>&A&A&C&C&J&J&L&L##7#;#;#=#=#D#D#F#F "3z??33 	] 	]	,/0DY0OQhirQs,t,t ] ](M5 0= @I.:H.x899EAA/9E9;M    )*=h*G%QZI[\\\\]	]%	]< #12233 v vS!4X!>??@@ 	^ 	^G28<WEE9CLW\5]5])'22(./B8/LRfRfpt(u(u(uH%%r)         ?   torch.Tensor | np.ndarray	thresholdfloatmin_community_sizelist[list[int]]c                   t          | t          j                  st          j        |           } t          j        || j                  }t          |           } g }t          |t          |                     }t          t          d|z  d          t          |                     }t          t          dt          |           |          d|           D ]}| |||z            | j        z  }| j        j        dv r||k    }	|	                    d          }
|
|k    }|                                s\|
|         }
||         }|
                                }|                    |d	
          \  }}t!          |
|          D ]4\  }}|                    |d|                                                    5|                    |d	
          \  }}t          t          |                    D ]}||         d         |k    r||                             |d	
          \  }}|d         |k    rr|t          |           k     r_t          d|z  t          |                     }||                             |d	
          \  }}|d         |k    r|t          |           k     _|                    |||k                                                        t'          |d d	          }g }t)                      }t+          |          D ]b\  }}g }|D ]}||vr|                    |           t          |          |k    r*|                    |           |                    |           ct'          |d d	          }|S )a  
    Function for Fast Community Detection.

    Finds in the embeddings all communities, i.e. embeddings that are close (closer than threshold).
    Returns only communities that are larger than min_community_size. The communities are returned
    in decreasing order. The first element in each list is the central point in the community.

    Args:
        embeddings (torch.Tensor or numpy.ndarray): The input embeddings.
        threshold (float): The threshold for determining if two embeddings are close. Defaults to 0.75.
        min_community_size (int): The minimum size of a community to be considered. Defaults to 10.
        batch_size (int): The batch size for computing cosine similarity scores. Defaults to 1024.
        show_progress_bar (bool): Whether to show a progress bar during computation. Defaults to False.

    Returns:
        List[List[int]]: A list of communities, where each community is represented as a list of indices.
    ri      2   r   zFinding clusters)descdisable)cudanpur   T)kr.   Nr+   c                     t          |           S Nr;   r2   s    r'   r4   z%community_detection.<locals>.<lambda>S  s    A r)   r5   c                     t          |           S r   r   r2   s    r'   r4   z%community_detection.<locals>.<lambda>c  s    #a&& r)   )rm   r<   r   tensorrj   r
   r>   r;   maxr   r:   Ttypesumanyr=   rz   rG   r@   r/   rD   rA   update)r&   r   r   r   r   extracted_communitiessort_max_size	start_idxr   threshold_maskrow_wise_countlarge_enough_maskr   rg   top_k_indicescountr   top_k_valuesrS   top_val_largetop_idx_largeunique_communitiesextracted_ids
cluster_id	communitynon_overlapped_communityidxs                              r'   community_detectionr     s   0 j%,// .\*--
Yz/@AAAI%j11J /ZAAA 22B77ZIIMaZ*--4FTePe   *e *e	  	I
,B BCjlR
 !_44'94N+//22N !/2D D$((** +,=>N#$56J ""$$A)DAAA} #&nm"D"D G Gw%,,WVeV_-C-C-E-EFFFFG )oo0BDoQQOL! 3|,,-- 
e 
e?2&)333=a=3E3E_c3E3d3d0M= (+i77MCPZOO<[<[(+A,=s:(O(O7A!}7I7IMcg7I7h7h4} (+i77MCPZOO<[<[ *00}PY?Y1Z1a1a1c1cddd
e ##8>N>NX\]]] EEM!*+@!A!A ; ;
I#%  	5 	5C-''(//444'((,>>>%%&>???  !9::: 28H8HRVWWWr)   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   )r&   r   r   r   r   r   r   r   r   r   r   r   r    r!   )r    r[   )rb   r   rc   r   r   r   r   r   r   r   r   r   r    r[   )r   ra   r   F)r&   r   r   r   r   r   r   r   r   r   r    r   )
__future__r   r{   loggingr8   collections.abcr   typingr   numpyrn   r<   r   tqdm.autonotebookr   
similarityr	   r   r
   	getLogger__name__logger)sentence_transformers.SentenceTransformerr   r(   r%   r`   r]   r   r1   r)   r'   <module>r      s   " " " " " "    $ $ $ $ $ $                        " " " " " "       ( ( ( ( ( (		8	$	$ NMMMMMM $ #9@#"? ? ? ? ?H !#9@F F F F FR, , , ,  #9@X X X X Xz  #c c c c c c cr)   