
    bieB                        d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	  e	            rd dl
Z
 ej        e          Z	 	 d d	e
j        d
e
j        de
j        dz  dede
j        f
dZde
j        j        de
j        de
j        de
j        de
j        f
dZ	 d!d	e
j        d
e
j        de
j        dz  de
j        fdZ	 	 	 d"d	e
j        d
e
j        de
j        dz  de
j        dz  dede
j        fdZde
j        j        de
j        de
j        de
j        de
j        f
dZ G d de          Z e            Zde
j        de
j        fdZ	 d!ddddee
j        j                 dz  dededee
j        j                 fdZdS )#    )Callable)wraps   )logging)GeneralInterface)is_torch_availableNFinputweightbiasis_transposedreturnc                    |r<t          j        |                     d          |                              d          }n;t          j        ||                     d                                        d          }|||z   }|S )a  Batched linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (batch_size, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (batch_size, output_dim, input_dim) if transposed is `False`,
            else of shape (batch_size, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (batch_size, output_dim). Default is `None`.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (batch_size, output_dim).
       )torchbmm	unsqueezesqueeze)r	   r
   r   r   outs        X/root/projects/butler/venv/lib/python3.11/site-packages/transformers/integrations/moe.py_batched_linearr   D   s{    *  Ai**F33;;A>> i 3 344<<R@@DjJ    selfhidden_statestop_k_indextop_k_weightsc                 `   |j         }|                    d          }|                    d          }|                    d          }t          j        ||                              d                              d|                              d          }|                    d          }	|                    d          }
|
| j        k     }|
                    d| j        dz
            }||         }| j	        |         }| j
        |         }| j        r| j        |         nd }| j        r| j        |         nd }t          |||| j                  }|                     |          }t          |||| j                  }|	j        |j        k    r|	                    d|          }	||	                    d          z  }||                    d                              |j                  z  }|                    |||                              d          }|                    |j                  S )Nr   r   devicer   r   dim)r   sizer   aranger   expandreshapenum_expertsclampgate_up_proj	down_projhas_biasgate_up_proj_biasdown_proj_biasr   r   _apply_gateshapegathertodtypeviewsum)r   r   r   r   r   	num_top_k
num_tokens
hidden_dim	token_idxsample_weights
expert_ids
valid_maskexpert_ids_clampedselected_hidden_statesselected_gate_upselected_downselected_gate_up_biasselected_down_biasgate_up_out	gated_outout_per_samplefinal_hidden_statess                         r   batched_mm_experts_forwardrF   f   sJ    !F  $$I##A&&J##B''J Z777AA!DDKKBPYZZbbceffI"**2..N$$R((J d..J#))!T-=-ABB +95 ();<N#56MJN-aD23EFF]aDHM[,-?@@W[ " 02GW[Wi  K
   --I %="4DDV  N
 1777'..q2DEE#n&>&>r&B&BBN#j&:&:2&>&>&A&A.BV&W&WWN )--j)ZPPTTYZT[[!!-"5666r   offsc                 d   t          t          j        j        d          r?t          j        j                            |                     |j                  ||          S t          t          d          r/t          j        |                     |j                  ||          S t          d          )a  Grouped matrix multiplication dispatcher that uses torch.nn.functional.grouped_mm if available, else falls back to torch._grouped_mm.
    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, output_dim, input_dim).
        offs (`torch.Tensor`, *optional*):
            Offsets tensor indicating the boundaries of each group in the input tensor.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    Raises:
        ImportError: If neither `torch.nn.functional.grouped_mm` nor `torch._grouped_mm` is available, indicating that the PyTorch version is incompatible.
    
grouped_mmrG   _grouped_mmzNeither torch.nn.functional.grouped_mm nor torch._grouped_mm is available. Please make sure you are using a PyTorch version that includes grouped_mm (2.9+).)	hasattrr   nn
functionalrI   r1   r2   rK   ImportError)r	   r
   rG   s      r   rK   rK      s    , ux"L11 
x"--ehhv|.D.DfSW-XXX		&	& 
 &,!7!7dKKKK`
 
 	
r   c                     |rt          | ||          }n&t          | |                    dd          |          }|||z   }|S )a*  Grouped linear layer supporting optional bias and transposed weights.

    Args:
        input (`torch.Tensor`):
            Input tensor of shape (S, input_dim).
        weight (`torch.Tensor`):
            Weight tensor of shape (num_experts, output_dim, input_dim) if transposed is `False`,
            else of shape (num_experts, input_dim, output_dim).
        bias (`torch.Tensor`, *optional*):
            Bias tensor of shape (num_experts, output_dim). Default is `None`.
        offs (`torch.Tensor`, *optional*):
            Offsets tensor indicating the boundaries of each group in the input tensor.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the weight tensor is transposed.
    Returns:
        `torch.Tensor`: Output tensor of shape (S, output_dim).
    rJ   r   )rK   	transpose)r	   r
   r   rG   r   r   s         r   _grouped_linearrS      s]    0  F%d333 %!1!1"b!9!9EEEDjJr   c                    |j         }|                    d          }|                    d          }|                    d          }t          j        ||                              d                              d|                              d          }|                    d          }	|                    d          }
||         }t          j        |
          }t          j        |          }|
|         }|	|         }||         }| j        }| j	        }| j
        r| j        |         nd }| j
        r| j        |         nd }|j        dk    r|                                n|                                }t          j        || j        d| j        dz
            }t          j        |dt          j                  }t)          ||||| j                  }|                     |          }t)          ||||| j                  }||                    d          z  }||         }|                    |||                              d	          }|                    |j                  S )
Nr   r   r   r   cpu)binsminmax)r"   r2   r    r!   )r   r#   r   r$   r   r%   r&   argsortr)   r*   r+   r,   r-   typefloatinthistcr'   cumsumint32rS   r   r.   r3   r4   r1   r2   )r   r   r   r   r   r5   r6   r7   r8   r9   r:   r=   perminv_permexpert_ids_gsample_weights_gselected_hidden_states_gr>   r?   r@   rA   histc_inputnum_tokens_per_expertoffsetsrB   rC   out_per_sample_grD   rE   s                                r   grouped_mm_experts_forwardri      s    !F  $$I##A&&J##B''J Z777AA!DDKKBPYZZbbceffI"**2..N$$R((J +95 =$$D}T""Hd#L%d+5d; (NMDHM[D2<@@W[>BmU,\::QU
 +1+*>*>,$$&&&LDTDTDVDVK!K$:JPQW[WgjkWkllll0au{KKKG " "24I7bfbt  K
   --I '="4gTM_  
 (*:*D*DR*H*HH &h/N )--j)ZPPTTYZT[[!!-"5666r   c                   :     e Zd ZdZeedZdededef fdZ	 xZ
S )ExpertsInterfacez9Interface for registering custom experts implementations.)
batched_mmrI   experts_implementationdefaultr   c                     |t                               d           n|dk    r|| vrt          d| d          t                                          ||          S )zfReturn the requested `experts_implementation`. Also strictly check its validity, and raise if invalid.Na
  You tried to access the `ExpertsInterface` with a `config._experts_implementation` set to `None`. This is expected if you use an Expert Module as a standalone Module. If this is not the case, something went wrong with the dispatch of `config._experts_implementation`eager`zL` is not a valid experts implementation registered in the `ExpertsInterface`)loggerwarning_onceKeyErrorsuperget)r   rm   rn   	__class__s      r   get_interfacezExpertsInterface.get_interface4  s    !)N   
 $w..3IQU3U3Ux*xxx   ww{{17;;;r   )__name__
__module____qualname____doc__rF   ri   _global_mappingstrr   rx   __classcell__)rw   s   @r   rk   rk   ,  sl        CC 10 O
<C <( <x < < < < < < < < < <r   rk   rB   c                 f    |                     dd          \  }}|                     |          |z  S )a  
    Default gating mechanism: splits the gate_up_out into gate and up parts,
    applies the activation function to the gate part, and multiplies it with the up part.
    Args:
        gate_up_out (`torch.Tensor`):
            The output tensor from the gate and up projection of shape (S, 2 * intermediate_dim).
    Returns:
        `torch.Tensor`: The gated output tensor of shape (S, intermediate_dim).
    r   r   r!   )chunkact_fn)r   rB   gateups       r   _default_apply_gater   F  s7        ++HD";;tr!!r   )r   r+   experts_classr+   c                    dt           t          j        j                 dt           t          j        j                 ffd}|  ||           S |S )aV  Decorator to modify experts class to support different experts implementations.

    Args:
        experts_class (`type[torch.nn.Module]`, *optional*):
            The experts class to modify. If not provided, returns a decorator that can be applied to the class.
        is_transposed (`bool`, *optional*, defaults to `False`):
            Whether the expert weights are stored in transposed format.
        has_bias (`bool`, *optional*, defaults to `False`):
            Whether the expert layers include bias terms.

    Returns:
        `type[torch.nn.Module]`: The modified experts class.
    r   r   c                     | j         | j        t                    fd            }t                    fd            }t          | d          st          | _        || _         || _        | S )Nc                 L     | |g|R i | || _         | _        | _        d S N)configr+   r   )r   r   argskwargsr+   r   original_inits       r   __init__z=use_experts_implementation.<locals>.wrapper.<locals>.__init__i  sA    M$8888888 DK$DM!.Dr   c                 f    t                               | j        j                  } || g|R i |S r   )ALL_EXPERTS_FUNCTIONSrx   r   _experts_implementation)r   r   r   experts_forwardoriginal_forwards       r   forwardz<use_experts_implementation.<locals>.wrapper.<locals>.forwardp  sE    3AA35E O #?49$999&999r   r.   )r   r   r   rL   r   r.   )r   r   r   r   r   r+   r   s      @@r   wrapperz+use_experts_implementation.<locals>.wrappere  s    %.(0	}			/ 	/ 	/ 	/ 	/ 	/ 
		/ 
	 	 	: 	: 	: 	: 
!	 	: }m44 	<(;M%!) 'r   )rZ   r   rM   Module)r   r   r+   r   s    `` r   use_experts_implementationr   T  sd    "tEHO4 eho9N       0  w}%%%Nr   )NFr   )NNF)collections.abcr   	functoolsr   utilsr   utils.genericr   utils.import_utilsr   r   
get_loggerry   rr   Tensorboolr   rM   r   rF   rK   rS   ri   rk   r   r   rZ   r    r   r   <module>r      s&   % $ $ $ $ $             , , , , , , 3 3 3 3 3 3  LLL		H	%	%Z !%	 <L ,
 	
 \   D67
(/67<67 67 <	67
 \67 67 67 67x !%
 
<
L
 ,

 \	
 
 
 
H !% $# #<#L# ,
# ,
	#
 # \# # # #LC7
(/C7<C7 C7 <	C7
 \C7 C7 C7 C7L< < < < <' < < <. )(** "5< "EL " " " " 37,QVin, , ,(4/,JN,bf,	%(/, , , , , ,r   