
    bie!                    t    d dl mZ d dlmZ d dlZd dlmZmZ d dlmZ d dl	m
Z
  G d dej                  ZdS )	    )annotations)IterableN)Tensornn)util)SentenceTransformerc                  X     e Zd Zej        dfd fdZddZddZedd            Z	 xZ
S )DistillKLDivLossg      ?modelr   temperaturefloatreturnNonec                    t                                                       || _        || _        || _        t          j        d          | _        dS )a  
        Compute the KL divergence loss between probability distributions derived from student and teacher models' similarity scores.
        By default, similarity is calculated using the dot-product. This loss is designed for knowledge distillation
        where a smaller student model learns from a more powerful teacher model.

        The loss computes softmax probabilities from the teacher similarity scores and log-softmax probabilities
        from the student model, then calculates the KL divergence between these distributions.

        Args:
            model: SentenceTransformer model (student model)
            similarity_fct: Which similarity function to use for the student model
            temperature: Temperature parameter to soften probability distributions (higher temperature = softer distributions)
                A temperature of 1.0 does not scale the scores. Note: in the v5.0.1 release, the default temperature was changed from 2.0 to 1.0.

        References:
            - For more details, please refer to https://huggingface.co/papers/2010.11386

        Requirements:
            1. (query, positive, negative_1, ..., negative_n) examples
            2. Labels containing teacher model's scores between query-positive and query-negative pairs

        Inputs:
            +------------------------------------------------+------------------------------------------------------------+
            | Texts                                          | Labels                                                     |
            +================================================+============================================================+
            | (query, positive, negative)                    | [Teacher(query, positive), Teacher(query, negative)]       |
            +------------------------------------------------+------------------------------------------------------------+
            | (query, positive, negative_1, ..., negative_n) | [Teacher(query, positive), Teacher(query, negative_i)...]  |
            +------------------------------------------------+------------------------------------------------------------+

        Relations:
            - Similar to :class:`~sentence_transformers.losses.MarginMSELoss` but uses KL divergence instead of MSE
            - More suited for distillation tasks where preserving ranking is important

        Example:

            Using a teacher model to compute similarity scores for distillation:

            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset
                import torch

                student_model = SentenceTransformer("microsoft/mpnet-base")
                teacher_model = SentenceTransformer("all-mpnet-base-v2")
                train_dataset = Dataset.from_dict({
                    "query": ["It's nice weather outside today.", "He drove to work."],
                    "positive": ["It's so sunny.", "He took the car to work."],
                    "negative": ["It's very cold.", "She walked to the store."],
                })

                def compute_labels(batch):
                    emb_queries = teacher_model.encode(batch["query"])
                    emb_positives = teacher_model.encode(batch["positive"])
                    emb_negatives = teacher_model.encode(batch["negative"])

                    pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
                    neg_scores = teacher_model.similarity_pairwise(emb_queries, emb_negatives)

                    # Stack the scores for positive and negative pairs
                    return {
                        "label": torch.stack([pos_scores, neg_scores], dim=1)
                    }

                train_dataset = train_dataset.map(compute_labels, batched=True)
                loss = losses.DistillKLDivLoss(student_model)

                trainer = SentenceTransformerTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
                trainer.train()

            With multiple negatives:

            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset
                import torch

                student_model = SentenceTransformer("microsoft/mpnet-base")
                teacher_model = SentenceTransformer("all-mpnet-base-v2")

                train_dataset = Dataset.from_dict(
                    {
                        "query": ["It's nice weather outside today.", "He drove to work."],
                        "positive": ["It's so sunny.", "He took the car to work."],
                        "negative1": ["It's very cold.", "She walked to the store."],
                        "negative2": ["Its rainy", "She took the bus"],
                    }
                )


                def compute_labels(batch):
                    emb_queries = teacher_model.encode(batch["query"])
                    emb_positives = teacher_model.encode(batch["positive"])
                    emb_negatives1 = teacher_model.encode(batch["negative1"])
                    emb_negatives2 = teacher_model.encode(batch["negative2"])

                    pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
                    neg_scores1 = teacher_model.similarity_pairwise(emb_queries, emb_negatives1)
                    neg_scores2 = teacher_model.similarity_pairwise(emb_queries, emb_negatives2)

                    # Stack the scores for positive and multiple negative pairs
                    return {
                        "label": torch.stack([pos_scores, neg_scores1, neg_scores2], dim=1)
                    }

                train_dataset = train_dataset.map(compute_labels, batched=True)
                loss = losses.DistillKLDivLoss(student_model)

                trainer = SentenceTransformerTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
                trainer.train()
        	batchmean)	reductionN)super__init__r   similarity_fctr   r   	KLDivLossloss_fct)selfr   r   r   	__class__s       h/root/projects/butler/venv/lib/python3.11/site-packages/sentence_transformers/losses/DistillKLDivLoss.pyr   zDistillKLDivLoss.__init__   sK    h 	
,&{;;;    sentence_featuresIterable[dict[str, Tensor]]labelsr   c                L      fd|D             }                      ||          S )Nc                F    g | ]}                     |          d          S )sentence_embedding)r   ).0sentence_featurer   s     r   
<listcomp>z,DistillKLDivLoss.forward.<locals>.<listcomp>   s-    sssM]djj!1223GHsssr   )compute_loss_from_embeddings)r   r   r   
embeddingss   `   r   forwardzDistillKLDivLoss.forward   s3    ssssarsss
00VDDDr   r&   list[Tensor]c                6    |d         t          j         fd|dd          D             d          }| j        z  }t          j        |d          }| j        z  }t          j        |d          }                     ||          }| j        dz  z  }|S )Nr   c                <    g | ]}                     |          S  )r   )r"   embeddings_otherembeddings_queryr   s     r   r$   zADistillKLDivLoss.compute_loss_from_embeddings.<locals>.<listcomp>   s,    lllIYT  !13CDDlllr      )dim   )torchstackr   log_softmaxsoftmaxr   )	r   r&   r   student_scoresstudent_log_probsteacher_scoresteacher_probslossr-   s	   `       @r   r%   z-DistillKLDivLoss.compute_loss_from_embeddings   s    %a= lllll]ghihjhj]klll
 
 

 ($*::!-n!DDD  $"22n!<<< }}.>>t'*+r   strc                    dS )Nai  
@misc{lin2020distillingdenserepresentationsranking,
      title={Distilling Dense Representations for Ranking using Tightly-Coupled Teachers},
      author={Sheng-Chieh Lin and Jheng-Hong Yang and Jimmy Lin},
      year={2020},
      eprint={2010.11386},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2010.11386},
}
r+   )r   s    r   citationzDistillKLDivLoss.citation   s    
 
r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r&   r(   r   r   r   r   )r   r:   )__name__
__module____qualname__r   pairwise_dot_scorer   r'   r%   propertyr<   __classcell__)r   s   @r   r
   r
      s        9=9Pgjx< x< x< x< x< x< x<tE E E E
   ,    X    r   r
   )
__future__r   collections.abcr   r1   r   r   sentence_transformersr   )sentence_transformers.SentenceTransformerr   Moduler
   r+   r   r   <module>rH      s    " " " " " " $ $ $ $ $ $          & & & & & & I I I I I Ib b b b bry b b b b br   