
    bi7                       d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dl	mZ d dlmZmZmZmZmZ d dlZd dlmZ d d	lmZ d d
l m Z  d dl!Z!d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 erd dl!m2Z2m3Z3m4Z4 ddl5m6Z6  e(            rd dl7Z7 e,            rd dl8m9Z9  e*            o e'            o e+            o	 e)            Z:e:rd dl;Z;d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZR d dlNmMZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk  G d d egd!"          Zl G d# d$eUd!"          Zm G d% d&eGd!"          Zn ejel          Zo ejem          Zp ejen          Zqh d'Zrh d(Zsh d)Zt e1ju        ev          Zwd*d+d,d-iZx eyexz                                          Z{d.Z|d/ Z}d0 Z~d1 Z G d2 d3ej                  Zd4ed5e$d6e$fd7Z G d8 d9          Z G d: d;          Z G d< d=          Zd>e_        evd?k    r e            ZdS dS )@    N)	GeneratorIterable)asynccontextmanager)	lru_cache)BytesIO)Thread)TYPE_CHECKING	AnnotatedOptional	TypedDictUnion)scan_cache_dir)DecodeStream)tqdm)AutoTokenizerBitsAndBytesConfigGenerationConfigPreTrainedTokenizerBase)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )LogitsProcessorListTextIteratorStreamer)logging)PreTrainedModelPreTrainedTokenizerFastProcessorMixin)ContinuousBatchingManager)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionChatCompletionMessageChatCompletionMessageParam)Choice)ChatCompletionChunkChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                       e Zd ZU dZeed<   dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__     Q/root/projects/butler/venv/lib/python3.11/site-packages/transformers/cli/serve.pyrH   rH   s   *         	 	 rR   rH   F)totalc                       e Zd ZU dZeed<   dS )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rI   NrJ   rQ   rR   rS   rW   rW   z   rT   rR   rW   c                   6    e Zd ZU dZeed<   eed<   dZeed<   dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerI   FstreamN)	rK   rL   rM   rN   bytesrP   rO   r[   boolrQ   rR   rS   rY   rY      sC         	 	 rR   rY   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopr_   audior`   logprobsmetadata	functions
modalities
predictionrf   rg   rh   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   ra   rb   languageru   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                 8    dd l }|                    |            d S Nr   )torchmanual_seed)_seedr   s     rS   set_torch_seedr      s%    LLL	erR   c                  v    dd l } | j                                        r| j                                         d S d S r   )r   cudais_availableempty_cache)r   s    rS   reset_torch_cacher      sE    LLLz   !
     ! !rR   c                 4    dd l }|                    |           S r   )r   	ones_like)_input_tensorr   s     rS   torch_ones_liker      s    LLL??=)))rR   c                       e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rK   rL   rM   r   r   r   r   rQ   rR   rS   r   r      s"        
C
C
C
CCCrR   r   reqmodel_generation_configreturnc                    |                      d          %t          di t          j        | d                   }nt	          j        |          } |j        di |}|                                D ]\  }}|t          |||           |                      d          t          | d                   |_
        |                      d          t          | d                   |_
        |                      d          t          | d                   |_        |                      d          | d         |_        |                      d          | d         |_        |                      d          :t          | d                   |_        t          | d                   d	k    rd
|_        |                      d          t          | d                   |_        |                      d          t%          | d                    |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rI   Nmax_output_tokens
max_tokensfrequency_penalty
logit_biasrl   temperatureg        Ftop_pseedrQ   )getr   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   r   )r   r   kwargsrI   non_standard_kwargskvs          rS   !create_generation_config_from_reqr      s   . ww"##/,TTtz#>Q:R/S/STT M*ABB2+2<<V<<#))++ - -1=%q!,,, ww"##/+.s3F/G+H+H( ww|(+.s</@+A+A(
ww"##//4S9L5M/N/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A(A%]#$$++*/'
www#"'G"5"5
wwv"s6{###rR   c                       e Zd ZdZd Zd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 .    |                                   d S N)resetselfs    rS   __init__zToolState.__init__)  s    

rR   c                 >    d| _         d| _        d| _        d| _        dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    rS   r   zToolState.reset,  s%     %%*"!"rR   N)rK   rL   rM   rN   r   r   rQ   rR   rS   r   r   &  s8        AA      rR   r   c                   T    e Zd ZdZ	 ddddeded         dz  fdZd	 Zd
 Zd Z	d Z
dS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nmodelr   timeout_seconds	processor)r!   r    c                     || _         t          |j                  | _        || _        || _        t          j        | j        | j                  | _	        | j	        
                                 d S r   )r   rO   name_or_path_name_or_pathr   r   	threadingTimertimeout_reached_timerr   )r   r   r   r   s       rS   r   zTimedModel.__init__:  s`     
 !344".od&:D<PQQrR   c                     | j                                          t          j        | j        | j                  | _         | j                                          dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r   r   s    rS   reset_timerzTimedModel.reset_timerG  sI    od&:D<PQQrR   c                     t          | d          rU| j        P| `| `d| _        d| _        t          j                     t                       | j                                         dS dS dS )z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   r   r   r   s    rS   delete_modelzTimedModel.delete_modelM  sx    4!! 	!dj&<
DJ!DNJLLL  K     	! 	!&<&<rR   c                     | j         dk    r@|                                  t                              | j         d| j          d           d S d S )Nr   z was removed from memory after z seconds of inactivity)r   r   loggerinfor   r   s    rS   r   zTimedModel.timeout_reached\  sd    !##KK%rrdFZrrr     $#rR   c                 6    t          | d           p| j        du S )z)Check if the instances have been deleted.r   N)r   r   r   s    rS   
is_deletedzTimedModel.is_deletedc  s!    4)))?TZ4-??rR   r   )rK   rL   rM   rN   r   r   r   r   r   r   r   rQ   rR   rS   r   r   4  s          PT	    DEL	     ! ! !  @ @ @ @ @rR   r   c            $          e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dXdee ej        d	
          f         dee ej        d
          f         deedz   ej        d
          f         dee ej        d
          f         deedz   ej        d
          f         deedz   ej        d
          f         dee ej        d
          f         dee ej        d
          f         dee ej        d
          f         dee ej        d
          f         deedz   ej        d
          f         dee ej        d 
          f         d!ee ej        d"
          f         d#eedz   ej        d$
          f         d%ee ej        d&d'(          f         d)df d*Z	d+ Z
d, Zd-ed.ed/ed0efd1Zd-efd2Zd-efd3Zd-efd4Z	 	 	 	 	 	 	 	 dYd6ed7edz  d8edz  d9edz  d:edz  d;ee         dz  d<edz  d=ed>         d)efd?Zed@eez  d)efdA            ZeedZdBedz  d)eeeef                  fdC                        ZdDed6ed)e e!z  fdEZ"edZd8dFd)e#fdG            Z$edHe#fdI            Z%dDed)e e!z  fdJZ&dDed)e'eddf         fdKZ(dDed)efdLZ)dDed)e'eddf         fdMZ*dDed)efdNZ+d)e,dz  fdOZ-dPed)efdQZ.dRefdSZ/dRed)e0dT         fdUZ1dRed)e0dV         fdWZ2dS )[ServeFautoN	localhost@  ,  r   continuous_batchingz8Whether to use continuous batching for chat completions.)helpdevicezgDevice to use for inference; will default to `auto` and place the model on an accelerator if available.dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.trust_remote_codez2Whether to trust remote code when loading a model.attn_implementationzWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.quantizationzAWhich quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'hostz$Interface the server will listen to.portzPort the server will listen to.model_timeoutz@Time in seconds after which a model will be removed from memory.	log_levelz8Logging level as a string. Example: 'info' or 'warning'.default_seedz1The default seed for torch, should be an integer.enable_corsztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.input_validationz+Whether to turn on strict input validation.force_modelzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.non_blockingTz/Whether to run the server in a separate thread.)hiddenr   r   c                     t           st          d          | _        | _        | _        | _        | _        | _        | _        | _	        |	 _
        |
 _        | _        | _        | _        | _        | _        |t#          |           t%          j        d          }|                    t$          j        |
                                                    t%          j        d          }|                    t$          j        |
                                                    i  _        d  _        d  _        d  _        d  _         j
         j        rdnd _
         j        r6                      j                  }| _                             |           t<          dt>          f fd            }t?          |          } j        r<|                     tB          d	gd
d	gd	g           tD          #                    d           ddl$m%} |&                    d          d|dtN          f fd            }|&                    d          dtN          f fd            }|&                    d          d|f fd            }|(                    d          |)                    d           fd                        }|)                    d          d             }|*                    d          d|fd            }tW          j,        | j         j	         j                  }tW          j-        |           _.         j        r /                                 d S  j.        0                                 d S )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`transformersz+transformers.generation.continuous_batchingr   appc                   K   d W V  j                                         D ]}|                                 j        j                            dd           d S d S )NT   blocktimeout)loaded_modelsvaluesr   #running_continuous_batching_managerrl   )r   r   r   s     rS   lifespanz Serve.__init__.<locals>.lifespan  sz      EEEE+2244 % %""$$$$7C8==DRS=TTTTT DCrR   )r   *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsrequestbodyc                                          |           j        r                     || j        j                  S                     |          S )Nr  ) validate_chat_completion_requestr   #continuous_batching_chat_completionstate
request_idgenerate_chat_completion)r  r  r   s     rS   chat_completionz'Serve.__init__.<locals>.chat_completion  sT    11$1???' ;??gmF^___44T:::rR   z/v1/responsesc                                          |            |                     dd          }|s$                    |           }t          |          S                     |           }t          |d          S )Nr	  r[   Ttext/event-stream
media_type)validate_response_requestr   generate_response_non_streamingr'   generate_responser(   )r  r[   response_objoutputr   s       rS   	responsesz!Serve.__init__.<locals>.responses  s}    **7*;;;[[400F 2#CCGLL#L111++G44F$V8KLLLLrR   z/v1/audio/transcriptionsc           
        K   |                                  4 d {V }t          |d                                          d {V |d                   }t                              d|d         j         d|d         j         d|d         j        dz  dd	           d d d           d {V  n# 1 d {V swxY w Y                       |
           	                    |          }t          |d          S )NrZ   r   )rZ   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr	  r  r  )formrY   readr   debugfilenamecontent_typesizevalidate_transcription_requestgenerate_transcriptionr(   )r  r  parsed_requestr  r   s       rS   audio_transcriptionsz,Serve.__init__.<locals>.audio_transcriptions  s      ||~~ 	 	 	 	 	 	 	!F#F|0022222222w-" " "
 @d6l&; @ @$v,Jc @ @!&\.5?@ @ @  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ///GGG00@@F$V8KLLLLs   A?B//
B9<B9z
/v1/modelsc                  L    t          d                                 d          S )Nlist)objectdata)r'   get_gen_modelsr   s   rS   get_all_modelsz&Serve.__init__.<locals>.get_all_models  s'      64;N;N;P;P Q QRRRrR   z/healthc                  $    t          ddi          S )Nstatusok)r'   rQ   rR   rS   healthcheckz#Serve.__init__.<locals>.healthcheck  s    4 0111rR   httpc                    K   | j                             t                    pt          t	          j                              }|| j        _         ||            d {V }||j         t          <   |S r   )headersr   X_REQUEST_IDrO   uuiduuid4r  r  )r  	call_nextr  responses       rS   get_or_set_request_idz-Serve.__init__.<locals>.get_or_set_request_id#  sk       ,,\::Oc$*,,>O>OJ'1GM$&Yw////////H-7H\*OrR   )r   r   r   )1serve_dependencies_availableImportErrorr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
get_loggersetLevel
log_levelslowerr   r   last_messageslast_kv_cache
last_modelprocess_model_nameload_model_and_processorr   r$   add_middlewarer&   r   warning_oncefastapir  postdictoptionsr   
middlewareuvicornConfigServerserverstart_serverrun)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   transformers_logger	cb_loggermodel_id_and_revisionr   r   r  r  r  r%  r+  r/  r8  configs   `                            rS   r   zServe.__init__k  sC   t , 	s  
 $7 
!2#6 (		*"(& 0&( #<((( &0@@$$W%7	8I8I%JKKK&'TUU	7-ioo.?.?@AAA 57UY0 "!%'+'7!@SD 	A$($;$;D<L$M$M!3DO))*?@@@		U 	U 	U 	U 	U 	U 
		U x(((  
	"e"&"e"e     g   	$#####	(	)	)	;W 	;D 	; 	; 	; 	; 	; 
*	)	; 
/	"	"		Mt 		M 		M 		M 		M 		M 
#	"		M 
,	-	-	M 	M 	M 	M 	M 	M 
.	-	M" 
\	"	"				S 	S 	S 	S 
	 
#	"	S 
			2 	2 
		2 
			 	 	 	 
 		 $)$)t~^^^nV,, 	KOOrR   c                 |      fd}t          j        |dd           _         j                                         d S )Nc                      t          j                     _        t          j         j                    j                             j                                                   d S r   )asyncionew_event_loop_loopset_event_looprun_until_completerN  server   s   rS   _runz Serve.start_server.<locals>._run4  sP     /11DJ"4:...J))$+*;*;*=*=>>>>>rR   zuvicorn-threadF)targetnamedaemon)r   r   _threadr   )r   r]  s   ` rS   rO  zServe.start_server3  sS    	? 	? 	? 	? 	? !'t:JSXYYYrR   c                    | j         st          d          | j                                         st          d          d| j        _        | j         r6| j                                         r| j                             d           d S d S d S )NzHThe server cannot be killed as it was not launched in a separate thread.zThe server is already killed.Tr   )r   )ra  
ValueErroris_aliverN  should_exitjoinr   s    rS   kill_serverzServe.kill_server=  s    | 	ighhh|$$&& 	><==="&< 	)DL1133 	)La(((((	) 	) 	) 	)rR   r  schema	validatorunused_fieldsc                 x   t                               d|            t          |                                          }|j        }||z
  }|r1t                               d|            t          dd|           | j        r	 |                    |           nd# t          $ rW}t                               d|
                                            t          d|
                                          d}~ww xY w||z  }	|	r3t                               d|	            t          dd|	           dS dS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   r  setkeys__mutable_keys__errorr%   r   validate_pythonrF   errors)
r   r  rh  ri  rj  
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             rS   _validate_requestzServe._validate_requestH  s}   . 	5G55666 ((
/$}4 	nLLMOMMNNNC8l[j8l8lmmmm  	H))'2222" H H H>!((**>>???#AHHJJGGGGH
 (2M'A$' X>VXXYYY# #,gMe,g,g   	 	 s   B 
C=&AC88C=c                 V    |                      |t          t          t                     d S N)r  rh  ri  rj  )ry  rH   response_validatorUNUSED_RESPONSE_FIELDSr   r  s     rS   r  zServe.validate_response_requesty  s5    <(0	 	 	
 	
 	
 	
 	
rR   c                 V    |                      |t          t          t                     d S r{  )ry  rW   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr~  s     rS   r
  z&Serve.validate_chat_completion_request  s5    >*7	 	 	
 	
 	
 	
 	
rR   c                 V    |                      |t          t          t                     d S r{  )ry  rY   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr~  s     rS   r"  z$Serve.validate_transcription_request  s5    8-5	 	 	
 	
 	
 	
 	
rR   r   r  contentr   rolefinish_reason
tool_callsdecode_stream	tokenizerr    c	                     ||||                     |j        |          }t          |t          t	          j                              |t          t          |||          d|          gdd          }	|	S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        N)r  r  r  r   )deltaindexr  r   zchat.completion.chunk)idcreatedr   choicessystem_fingerprintr(  )step
_tokenizerr/   r   timeChoiceChunkr0   )
r   r  r  r   r  r  r  r  r  chunks
             rS   build_chat_completion_chunkz!Serve.build_chat_completion_chunk  s    D $)<AV#(()=wGGG#	$$% '!#-  
 "/  
  "*!
 
 
& rR   r  c                 6    d|                      d           dS )a/  
        Builds an event of a streaming OpenAI Response model or a ChatCompletion chunk.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            chunk (`BaseModel` or `ChatCompletionChunk`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        zdata: Texclude_nonez

)model_dump_json)r  s    rS   chunk_to_sse_elementzServe.chunk_to_sse_element  s'     G--4-@@FFFFrR   	cache_dirc           	      2   ddl m}m} g }t                              d           t          t          |           j                  D ]M}|j        dk    r|j	        }|
                                D ]!\  }}|j        }t          d |D             d          }	|	s*t          j        |	                                                                          }
t#          |
t$                    rd|
v s||
d         }|                                |                                t)          fd|D                       rYd	|j        v r|j                            d	          nd
}|j        |dk    rd| nd
z   }|                    ||d|j        d           #O|S )z2
        List LLMs and VLMs in the cache.
        r   !MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESz/Scanning the cache directory for LLMs and VLMs.r   c              3   :   K   | ]}|j         d k    |j        V  dS )zconfig.jsonN)	file_name	file_path).0fs     rS   	<genexpr>z'Serve.get_gen_models.<locals>.<genexpr>  s1      #_#_A!+Q^B^B^AKB^B^B^B^#_#_rR   Narchitecturesc              3   ,   K   | ]}|g v 
|V  d S r   rQ   )r  archllmsvlmss     rS   r  z'Serve.get_gen_models.<locals>.<genexpr>  s5      PP4$9O9Ot9O9O9O9OPPrR   /r   main@)owned_byr  r(  r  )&transformers.models.auto.modeling_autor  r  r   warningr   r   repos	repo_typerefsr   filesnextr   r   openr  
isinstancerH  r   anyrepo_idsplitappendlast_modified)r  r  r  generative_modelsrepor  refrevision_infor  config_pathrT  r  authorrepo_handler  r  s                 @@rS   r*  zServe.get_gen_models  s   	
 	
 	
 	
 	
 	
 	
 	

 HIII	22899 	 	D~((9D&*jjll  "]%+"#_#_#_#_#_aeff" K$4$4$6$6$;$;$=$=>>"6400 _5N5N & 78??AAAHHJJPPPPPPPPPP 
8;t|8K8KT\//444QSF"&,sf}})c)))RT"UK%,,(."-&-'+'9	   '8 ! rR   r   c           	          
                       |d                    j        k    } _        |r* j        # j                            dd           d _                                       \  }}t          |d          r|j        n|t          ||j        j	        j
        ddd	           j        L|                    
           _        t                       j        _         j                                         |                    |d         ddd                              |j                  d         d          fd fd
 fd}
 fd} j                            |j        |                    d                    }|                    d          rt+           ||          d          S  ||          }|                    d          }	t/          |	d          S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r   r  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rI   messagespt)return_tensorsadd_generation_promptreturn_dict	input_idsr   c              3     K   ddl m} 	                     | d          V  d}j                            |           D ]}|dz  }|j        r)|j        d         }                    | ||          V  |j        |j        k    rN|
j        k    }t          d	          r|j
        k    }|o| }|rd
nd}                    | |          V   d S d S # t          $ ra}	t                              t          |	                     j                            |            dt          |	           dV  Y d }	~	d S d }	~	ww xY w)Nr   )RequestStatus	assistantr  r   r      r   )r  r  r   r  r  	eos_tokenlengthrl   r  r   data: {"error": ""})generation.continuous_batchingr  r  r   request_id_itergenerated_tokensr-  FINISHEDr   r   r  	Exceptionr   rq  rO   cancel_request)r  r  r  n_tokens_generatedresulttoken_idgenerated_all_tokensfinal_token_is_eosreasonrw  rI   rS  r   r  s             rS   stream_chat_completionzIServe.continuous_batching_chat_completion.<locals>.stream_chat_completion>  s     FFFFFF(7 66z[p6qqqqq%&""FVVWabb  F&!+& . #)#:2#>">>'1$,"7*7&/ ?      }(>>>/AEVEe/e, #9k:: c179;N1N.3G3bPbLb0-A!Mv">>&*0"7 ?     
  ? >  7 7 7SVV$$$8GG
SSS63q666666666666667s   C	C C 
E#AD??Ec                    d }j                                         r9|7j                             | d          }j                                         r|7                    |j                  }t          | t          t          j                              dt          dt          |d          d          g	          }|S )
Nr  )r  r   chat.completionr   r  r  r  rl   r  messager  )r  r  r(  r   r  )
r   
is_running
get_resultdecoder  r+   r   r  r.   r,   )_request_idr  r  chat_completion_resultrS  r   r  s       rS   buffer_chat_completionzIServe.continuous_batching_chat_completion.<locals>.buffer_chat_completionk  s    F:EEGG pFNALLXcmnLoo :EEGG pFN  &&v'>??G%3DIKK(((+ 5gK X X X&,	  & & &"" *)rR   c                p  K   	 t                                          d          } | |          D ]4}                    |          W V  t          j        d           d {V  5d S # t          j        $ r< j                            |            t          	                    d|  d           Y d S w xY w)NFr   Request  was cancelled.)
r   tolistr  rW  sleepCancelledErrorr   r  r   r  )r  r  _chunkinputsr   r  s      rS   cancellation_wrapper_streamzNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_stream  s      H ,V]]__e D D44[-PP + +F33F;;;;;;!-**********+ + ) H H H8GGTTTF+FFFGGGGGGHs   A#A* *AB54B5c                     	  |           S # t           j        $ r< j                            |            t                              d|  d           Y d S w xY w)Nr  r  )rW  r  r   r  r   r  )r  r  r   s    rS   cancellation_wrapper_bufferzNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_buffer  sz    H--k:::) H H H8GGTTTF+FFFGGGGGGHs   
 AAAr[   )r  r   	streamingr  r  r  application/json)rB  rA  r   rl   rC  r   r  r   rI   r  r  init_continuous_batchingr   logit_processorr   apply_chat_templatetor   add_requestr   r   r(   r  r'   )r   r   r  must_discard_cacher   r   r  r  r  
json_chunkr  rI   r  rS  r  r  s   `         @@@@@@rS   r  z)Serve.continuous_batching_chat_completion  s    !% 7 7G E E2doE/  	@7C8==DRS=TTT;?8889NOOy+29k+J+JYI''PY	=$)$;"/"/
 
 
 3;7<7U7U"3 8V 8 8D4
 H[G\G\D4D4::<<< ..
ODZ^ / 
 

"U\

;(()++	7 +	7 +	7 +	7 +	7 +	7 +	7 +	7Z	* 	* 	* 	* 	* 	* 	*4		H 		H 		H 		H 		H 		H 		H	H 	H 	H 	H 	H 	H =IIz:K:Zfifmfmnvfwfw J 
 

 778 	K$%@%@%L%LYlmmmm//
;;E..D.AAJ
7IJJJJrR   r   c                 $   |!t          |t                    rt          j        S ddlm}m} | j        j        }||	                                v rt          j
        }n5||	                                v rt          j        }nt          d|           |S )Nr   r  zUnknown modality: )r  r   r   r   r  r  r  	__class__rK   r   r   rc  )r   r   r  r  model_classnamemodalitys         rS   get_model_modalityzServe.get_model_modality  s     )%<== $|#	
 	
 	
 	
 	
 	
 	
 	

  /2HOOQQQQ|HH A H H J JJJ|HHC/CCDDDrR   r  c           	      N   g }| D ]}|d         g d}|t           j        k    rt          |d         t                    r	|d         }ndt          |d         t                    rIg }|d         D ])}|d         dk    r|                    |d                    *d                    |          }||d<   n\|t           j        k    rKt          |d         t                    r&|d                             d|d         d           n
|d         D ] }|d         dk    r|d                             |           +|d         dk    rd	|d         d
         v rt          j	        dd|d         d
                   }t          j        t          t          j        |                              }t          j        dd          }	|	j        }
|                    |	j                   n|d         d
         }
|d                             d|
d           |                    |            |S )Nr  r  r  r  typer^    )r  r^   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r  r  )r   r   r  rO   r'  r  rf  r   resubr#   r  r   r  	b64decodetempfileNamedTemporaryFiler_  save)r  r  processor_inputsr  parsed_messageparsed_contentr  
image_datar  rZ   r  s              rS   *get_processor_inputs_from_inbound_messagesz0Serve.get_processor_inputs_from_inbound_messages  sN    '	4 '	4G&-fo"EEN8<'' gi0#66 >%,Y%7NN	 2D99 >%'N#*9#5 C C"6?f44*11'&/BBB%(XXn%=%=N,:y))X\)) gi0#66 \"9-44fgV_N`5a5abbbb#*9#5 \ \"6?f44*95<<WEEEE$V_;;'7;+?+FFF-/V4LbRYZeRfglRm-n-n
(-
76;KJ;W;W3X3X(Y(Y'/'B&Y^'_'_'_&*i %

49 5 5 5 5&-k&:5&A*95<<gVY=Z=Z[[[##N3333rR   c                      j         
 j         |d<   |d         }|d         d         dk    rdS                      |d                    j        k    } _                                       \  }                     |          }                     ||          }dt          D ],}|j        j        d         	                                v r| n-|
                    |d	|                    d
          dd	d	          }|                    j                  }|                    dd          d	}	dj        j        d         	                                v rd}	t          ||	d	          }
t          |j                  d}                     |          r9|s7 j                                        }|d         j        d         |k    r j        }i ||
d	|d fd}|                    d          r.t+          t-           j         ||
                    d          S g }d} ||
          }d}|D ]j}|j        d         }t3          |j        dd          r|                    |j        j                   |j        r|j        }t3          |dd          r|j        }kt?          tA          tC          j!                              dtE          dtG          d$                    |          d          |          g|           }|%                    d	!          }tM          |d"          S )#a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r   r  r  )r   r   Ttoolsr  )r  r#  r  r  tokenizer  req_0gptossFskip_special_tokensskip_promptr   r  )streamerrI   return_dict_in_generatepast_key_valuesc              3     K   d}d }dj         j        d                                         v rd}d}fd}t          |          }d}	 |                                 t                      }                    d	
          V  d}d}	| D ]R}|	dz  }	dj         j        d                                         v r|                    d          }||z  }|r||v rd}QR|                                t                   d         k    rd|_
        |                                t                   d         k    r0|                                                     |d d          V  |j
        rK|xj        |z  c_        |j        s_t          j        d|j                  }
|
|
                    d          }
d|_        t#          t%          |
          dd|dz             }n|dk    red|j        vrp|xj        |                    d          z  c_        |xj        |                    d          z  c_        |j        dk     r3d                    |                    d          d d                   dz   }t#          t%          |          dd          }                    |d |g          V  2|dk    r                    ||          V  T|	j        k    }t1          | j        d          r|| j        j        k    }|o| }|rdnd }                    ||!          V  |                                 nS# t6          $ rF}t8                              t=          |                     d"t=          |           d#V  Y d }~nd }~ww xY w|                                 d S # |                                 w xY w)$NFr&  r   T<|channel|>final<|message|>c                  :     j         di | }|j        _        d S NrQ   generater-  r@  r   generate_outputr   r   s     rS   generate_with_cachez[Serve.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cache?  +    "0%.":":6":":%4%D"""rR   r^  r   r   r  r  r  
<|return|>r   r   r  )r  r  r  r   z\"name\": \"(.*?)\")r_  function
_tool_call)r:  r  r  r  z"arguments": {{})	arguments)r:  r  r  )r  r  r  r   )r  r   r  r  rl   r  r  r  )rT  r  r>  r   r   r   r  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr1   r2   r   countrf  r  r   r   r  r  r  r   rq  rO   )r+  r  
filter_cotcot_trace_endr6  threadresults
tool_stater  r  	tool_nametoolr  r  r  rw  rI   generation_kwargsr   rS  r  r   tool_model_familys                   rS   r  z>Serve.generate_chat_completion.<locals>.stream_chat_completion5  s      J M5<5a8>>@@@@!
 =E E E E E E #6?PQQQFGy&[[
 66z[p6qqqqq%&"& [ [F&!+&  5<#=a#@#F#F#H#HHH!'!4!4\!B!Bv%G " %(G33).J$$ )4!<<>>->?P-QRY-ZZZ:>J7$ "<<>>->?P-QRW-XXX&,,..."&"B"B+6%).:&;	 #C # #    %%6 1%&--7-- $.#C $",.I6LjN_,`,`	#,#4$,090B0BICG
 @':-Hi-X-X-X*+)3'2\'A	(" (" (" $*R<<$, $4:;L#L#L$, !+ < <S@Q@Q Q < < * < <S@Q@Q Q < <#-#?!#C#C-/WWV\\#5F5Fss5K-L-Ls-RF':-HSY-Z-Z-Z*+)3(" (" (" #'"B"B+6%),06&;	 #C # #    % ||">>'?T ?      (:=N=]']$ 8-{;; [)/83E3O)O&+?+ZHZDZ(%9Ev66{RX`u6vvvvv 7 7 7SVV$$$63q66666666666667
 s1   K?M N5 
N<NN5 NN5 5Or[   r  r  rl   r  usager  r   r  r  )r  r  r(  r   r  rO  r  r   )'r   rB  rA  rC  r  r!  _MODELS_WITH_TOOL_SUPPORTrT  r  r>  r  r   r  r   r   r   rI   is_continuationr@  get_seq_lengthshaper(   mapr  r  getattrr  r  r  r  rO  r+   r   r  r.   r,   rf  
model_dumpr'   )r   r   r  r  r   r  r  supported_model_familiesr  r(  generation_streamerr@  seq_lenr  r  r  	generatorrO  r  choicer  r  rI   rM  r   rS  r  rN  s   `                     @@@@@@rS   r  zServe.generate_chat_completion  s    '+CL9<Z B<;..F $ 7 7G E E2doE/889NOOy**5I*FFJJ8U]^^ !(A 	 	$'5<+Ea+H+N+N+P+PPP$<! Q .."&'''"" / 
 
 5<((WW\733
 #u|1!4::<<<<"'2 3
 
 

 >c[`[rsss$$ 	3-? 	3(7799Gk"(,w66 $ 2

+!2'+,
 
 
J	 J	 J	 J	 J	 J	 J	 J	 J	 J	 J	X 778 (	G$D-/E/EFY[e/f/fgg.   
 G"M../BJOOIE" ( (q)6<D99 9NN6<#7888' 9$*$8M5'400 (!KE%3DIKK(((+ 5bggg>N>NU` a a a&3	   & & &"" ,66D6IIF3EFFFFrR   c                                           d                    j        k    } _                                       \  }t          d         t                    r1dv rdd         dgng }|                    dd         d           nt          d         t                    rTdv rGd         d         d         dk    rdd         dgd         }n{d         }d         |d         d	<   nad         }nXt          d         t                    r.dv rdd         dgng }|                    d                    nt          d
          |	                    |ddd          d         }|
                    j                  }                    dd          d}dj        j        d                                         v rd}t!          ||d          }t#          j                  }d}                               r9|s7 j                                        }	|d         j        d         |	k    r j        }|t/          |          ||d|d fd}
 |
|          S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr  r_   r   r  r  %inputs should be a list, dict, or strTr  r  r  r  r  rj   r%  r&  Fr'  r*  Nr   )r  attention_maskr+  rI   r,  r-  c              3     K   d}d }dj         j        d                                         v rd}d}fd}t          |          }d}d}d}	 |                                 t          j                    }	t          d|t          d	 |	d
                    d          dddiidg g                     dd          d                    d                              }
|dz  }	                    |
          V  t          d|t          d	 |	d                    d          dddiidg g                     dd          d                    d                              }|dz  }	                    |          V  t          d||t          d dddg                     }|dz  }	                    |          V  t          dd |||t          dd g !          "          }|dz  }	                    |          V  d }| D ]}dj         j        d                                         v r|                    d#          }||z  }|r?||v rd}d }Mt!          d$d ||||g %          }|dz  }	                    |          V  |r5t!          d$d ||||g %          }|dz  }	                    |          V  t#          d&d ||d|g '          }|dz  }	                    |          V  t%          d(d |||t          d|j        g !          "          }|dz  }|dz  }	                    |          V  t)          d)||t          d dd*d|j        gg +                    }|dz  }|dz  }	                    |          V  t-          d,|t          d	 |	d*                    d          dddii|j        gdg                     dd          d                    d          -                    }|dz  }	                    |          V  |                                 n# t2          $ r }t4                              d.t9          |                      t;          d/|t9          |          0          }|dz  }	                    |          V  t=          d1|t          d	 |	d2                    d          dddiig dg dd                    d          t?          d3t9          |          4          5                    }|dz  }	                    |          V  Y d }~nd }~ww xY w|                                 d S # |                                 w xY w)6NFr&  r   Tr/  c                  :     j         di | }|j        _        d S r1  r2  r4  s     rS   r6  zMServe.generate_response.<locals>.stream_response.<locals>.generate_with_cache8  r7  rR   r8  zresponse.createdresp_queuedr^  formatr  r^   r7  ry   r   ro   )r  
created_atr-  r   r^  r^   r(  r#  r  ry   rf   ro   )r  sequence_numberr7  r  zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r  r  r-  r  r  )r  ri  output_indexitemzresponse.content_part.addedoutput_textr   r  r^   annotations)r  item_idri  rl  content_indexpartr9  zresponse.output_text.delta)r  rq  ri  rl  rr  r  rn   zresponse.output_text.done)r  rq  ri  rl  rr  r^   rn   zresponse.content_part.donezresponse.output_item.done	completedr  r  r-  r  r  rp  zresponse.completedr  rh  r-  r   r^  r^   r  r(  r#  ry   rf   ro   z"Exception in response generation: rq  )r  ri  r  zresponse.failedfailedserver_error)coder  )r  rh  r-  r   r^  r^   r  r(  r#  ry   rf   ro   rq  ) rT  r  r>  r   r   r  r8   r4   r   r  r<   r=   r?   r6   r@   r@  rA   rB   r7   r^   r>   rs  r5   rm  rf  r  r   rq  rO   r:   r;   r9   )r+  r  rF  rG  r6  rH  ri  rl  rr  rh  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedrI  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedrw  error_eventresponse_failedrM  r   rS  r   r  r   s                           rS   stream_responsez0Serve.generate_response.<locals>.stream_response.  s      J M5<5a8>>@@@@!
 =E E E E E E #6?PQQQFOLMY!Y[[
 $8+$3%/://#-'3%(WW^%<%<&(89) !,/GG4I5,Q,Q$*!$!4!4  $ $ $ $  1$//0@AAAAA'>/$3%/://#-,3%(WW^%<%<&(89) !,/GG4I5,Q,Q$*!$!4!4  ( ( ($$  1$//0DEEEEE .J5$3!-..*..Y}[fpr  	. . .*  1$//0JKKKKK /L6/://$3!-"/+RUWXXX/ / /+  1$//0KLLLLL & %X %XF5<#=a#@#F#F#H#HHH!'!4!4\!B!Bv%G " X(G33).J&(G$9O%A(;z(;(;0?-9.;&,)+: : :6 ,q0O"&";";<V"W"WWWWW " X9O%A(;z(;(;0?-9.;&,)+: : :6 ,q0O"&";";<V"W"WWWW -B4/://$3!-"# - - -)  1$//0IJJJJJ .J5/://$3!-"/+E^Ecqsttt. . .*  1$"//0JKKKKK -H4$3!-..*..&*(!;!@ A$&  	- - -)  1$!//0IJJJJJ &<-$3%/://#-*3%(WW^%<%<&(89 9 >?) ,/GG4I5,Q,Q$*!$!4!4  & & &"$  1$//0BCCCCC !A !A !AJ#a&&JJKKK0 $3FF  
  1$//<<<<<"5*$3%/://#-'3%(WW^%<%<&(89!) ,1$*!$!4!4+!/$'FF    # # #,  1$//@@@@@@@@@@C!AH s2   O3Q U) UC6UU) UU) )U?)rB  rA  rC  r  rO   r  r'  rH  	TypeErrorr  r  r   r   rT  r  r>  r   r   rI   rQ  r@  rR  rS  r   )r   r   r  r   r  r(  rX  rI   r@  rY  r  rM  r   rS  r  s   ``         @@@@rS   r  zServe.generate_response  s&    !% 7 7G E E2doE/889NOOyc'lC(( 	EM[_bMbMbxC4GHHIIhjFMM6c'lCCDDDDGd++ 	E$$w<?6*h66'/C<OPP`SVW^S_`FF \F+.~+>F1Ii((WGd++ 	EM[_bMbMbxC4GHHIIhjFMM#g,''''CDDD..$tQU / 
 

 5<((WW3W==
 #u|1!4::<<<<"'2 3
 
 

 >c[`[rsss$$ 	3-? 	3(7799Gk"(,w66 $ 2 -f55+!2'+,
 
l	 l	 l	 l	 l	 l	 l	 l	 l	 l	\ 2J???rR   c                 l   |                      |d                   }|| j        k    }|| _        |                     |          \  }}t          |d         t                    r1d|v rd|d         dgng }|                    d|d         d           nt          |d         t                    rTd|v rG|d         d         d         dk    rd|d         dg|d         }n{|d         }|d         |d         d	<   na|d         }nXt          |d         t                    r.d|v rd|d         dgng }|                    |d                    nt          d
          |	                    |ddd          d         }|
                    |j                  }|                    dd          }d}d|j        j        d                                         v rd}t!          ||j                  }	d}
|                     |          r3|s1| j                                        }|j        d         |k    r| j        }
|                    |t/          |          |	d|
          }|j        | _        |                    |j        |          d         }t7          j                    }t9          d| dddt;          d|g           gg           }t=          d| |d||                    d          d d!d"ii|gd#g |                    d$d          d%|                    d&          '          }|                    d(          S ))a  
        Generates an OpenAI Response in non-streaming mode (single JSON payload).

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `dict`: The OpenAI `Response` serialized as a dict.
        r   r]  r^  r_  r  r_   r   r  r  r`  Tr  ra  r  rj   r%  r&  Fr*  Nr   )r  rb  rI   r,  r-  r(  rk  r  rt  r  rn  ro  ru  re  rg  r  r^   r7  ry   r   ro   rv  r  ) rB  rA  rC  r  rO   r  r'  rH  rc  r  r  r   r   rT  r  r>  r   rI   rQ  r@  rR  rS  r3  r   r-  batch_decode	sequencesr  r?   r@   r4   rV  )r   r   rS  r  r   r   r  r  r(  rI   r@  rY  r5  	full_textrh  response_output_itemr  s                    rS   r  z%Serve.generate_response_non_streaming  s    !% 7 7G E E2doE/889NOOyc'lC(( 	FM[_bMbMbxC4GHHIIhjFMM6c'lCCDDDDGd++ 	F$$w<?6*h66'/C<OPP`SVW^S_`FF \F+.~+>F1Ii((WGd++ 	FM[_bMbMbxC4GHHIIhjFMM#g,''''DEEE..$tQU / 
 

 5<((WW3W==
 #u|1!4::<<<<"'=c[`[rsss$$ 	3-? 	3(7799G|B')) $ 2..*622/$() ) 
 
 -< **?+DZm*nnopq	Y[[
4"j""']XZ[[[\ 
  
  
 &#z##!'00VV,-() #(=u E EWWZ((
 
 
 ",,$,???rR   c                 F  
 t                      st          d          |                     |d                   }|                     |          \  t	          j        dd          }t          |j                  }j        j	        }t          j        |d                   }t          j        ||d          \  }} ||d	                              j                  

d
                             j                  
d
<   ||dd
fd}	 |	            S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr'  r*  rZ   )srmonor  )sampling_rater  input_features)r+  rI   r,  c               3      K    j         di }                     | j        d          d         }t          |          }|                    d           V  d S )NTr  r   )r^   r  rQ   )r3  r  r  r)   r  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorrM  s      rS   _generate_transcriptionz=Serve.generate_transcription.<locals>._generate_transcription  s      0K0UU<UCTUUM!0!=!=m>Uko!=!p!pqr!s)/ABBBM"222EEGGGGGGrR   )r   r:  rB  load_audio_model_and_processorr   r  r   rI   feature_extractorr  ior   librosaloadr  r   r   )r   r   rS  rX  rI   model_sampling_rateaudio_bytesaudio_array_r  r  r  r  rM  s             @@@@rS   r#  zServe.generate_transcriptionx  s    $%% 	o   !% 7 7G E E'+'J'JK`'a'a$_2%4T
 
 
 >)F
 
 

 .?MjV-- k6IPTUUUQ&{BUfjkkknn
 
 *66F)G)J)J;K\)])]%& ,!2'+
 
	H 	H 	H 	H 	H 	H 	H 	H '&(((rR   c                 N   |                     d          p|                     d          }d}| j        d}ngt          | j                  t          |          k    rd}n?t          t          | j                            D ]}| j        |         ||         k    rd} n|| _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  r]  TNF)r   r?  lenrange)r   r   r  req_continues_last_messagesis        rS   rQ  zServe.is_continuation  s     77:&&:#'''*:*:&*# %*/''#$$H55*/'' 3t12233  %a(HQK7727/E 8 &**rR   c                     | j         dk    rt          ddd          }n| j         dk    rt          d          }nd}|t                              d|            |S )	z
        Returns the quantization config for the given CLI arguments.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        zbnb-4bitTnf4)load_in_4bitbnb_4bit_quant_typebnb_4bit_use_double_quantzbnb-8bit)load_in_8bitNz0Quantization applied with the following config: )r   r   r   r   )r   quantization_configs     rS   get_quantization_configzServe.get_quantization_config  s     
**"4!$)*.# # #
 *,,"4$"G"G"G"&*KK`K^``aaa""rR   model_idc                 4    | j         | j         }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        Nr  z@main)r   )r   r  s     rS   rB  zServe.process_model_name  s2     ''H(??O!!!!rR   rS  c                 `   ddl }ddlm}m} t                              d|            d|v r|                    dd          \  }}n|d}}	 |                    ||| j                  }nK# t          $ r> 	 t          j        ||| j                  }n# t          $ r t          d	          w xY wY nw xY w| j        d
v r| j        nt          || j                  }|                                 }	|| j        || j        | j        |	d}
 |j        |fi |
}t          t          |j        d                   } |j        |fi |
}|j        j        du o|j        j        dk    }|j        j        duo|j        j        dk     }|s|rd|j        _        t                              d|            ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        r   N)
AutoConfigAutoProcessorzLoading r  r  r  )revisionr   zBFailed to load processor with `AutoProcessor` and `AutoTokenizer`.)r   N)r  r   r   
device_mapr   r     r  zLoaded model )r   r   r  r  r   r   r  from_pretrainedr   OSErrorr   r   rU  r  r   r   r  rI   r   
max_length)r   rS  r   r  r  r  r  data_processorr   r  model_kwargsrT  architecturer   has_default_max_lengthhas_short_max_new_tokenss                   rS   _load_model_and_data_processorz$Serve._load_model_and_data_processor  s_    	::::::::6466777'''!6!<!<S!!D!DHhh!6hH	d*::!"&"8 ;  NN
  	d 	d 	dd!.!>%&*&<" " "
  d d dbcccd 	d #jN::

tz@Z@Z"::<< !#'#;+!%!7#6
 
 ,+HEEEE|V-A!-DEE,,XFFFF #2d:gu?V?aeg?g 	 #2$>p5CZCilpCp 	! " 	:%= 	:59E#2;$9;;<<<n$$s*   A+ +
B36BB3B--B32B3)r   r    c                 P   || j         vs| j         |                                         r8|                     |          \  }}t          || j        |          | j         |<   nC| j         |                                          | j         |         j        }| j         |         j        }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   r   r   r  r   r   r   r   r   )r   rS  r   r   s       rS   rC  zServe.load_model_and_processor4  s     !(:::d>PQf>g>r>r>t>t:#BBCXYYE98B $ 2#9 9 9D455 45AACCC&'<=CE*+@AKIirR   )r   r!   c                 P   || j         vs| j         |                                         r8|                     |          \  }}t          || j        |          | j         |<   nC| j         |                                          | j         |         j        }| j         |         j        }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   rS  r  r  s       rS   r  z$Serve.load_audio_model_and_processorO  s     !(:::d>PQf>g>r>r>t>t:+/+N+NOd+e+e(K8B $ 2)9 9 9D455 45AACCC,-BCIK"01FGQOO++rR   )Fr   r   FNNr   r   r   r   NFFNF)r   NNNNNNNr   )3rK   rL   rM   r
   r]   typerOptionrO   r   r   rO  rg  rH  r   rE   rn  ry  r  r
  r"  r'  r1   r   r   r/   r  staticmethodrD   r  r   r  r*  r(   r'   r  r   r  r!  r  r   r  r  r#  rQ  r   r  rB  r  tuplerC  r  rQ   rR   rS   r   r   h  sX             ZeUY    ns  qF F&,%,$^____
F
 EL~  
F $JEL x  
F" %,%,$XYYYY
#F( '$JEL Z  
)F4  $JELabbbd
5F< \U\/UVVVVW=F> \U\/PQQQQR?F@ !#effff
AFF #]^^^^
GFL  $J*]^^^^
MFR EL L  
SF^ $D,%,<i*j*j*j$jk_F` $JEL X  
aFl  ,%,d1bcccc
mFr 
sF F F FP  	) 	) 	)// / 	/
 / / / /b
 
 
 
 

 
 
 
 

d 
 
 
 
 " $(7;-19=8 88 t8 Tz	8
 Dj8 Tz8 ,-48 $d*8 568 
8 8 8 8t G$7)$C G G G G \G  -! -!#* -!T#s(^8L -! -! -! Y \-!^TKt TK TKQbeqQq TK TK TK TKl  "3     \( + x +  +  +  \+ ZCGD CG5F5U CG CG CG CGJq@T q@iT4.H q@ q@ q@ q@f	X@4 X@D X@ X@ X@ X@t.)$ .)9S$_3M .) .) .) .)`+4 +D + + + +<#);d)B # # # #."3 "3 " " " ""D%C D% D% D% D%L %( 	;	<       6,C ,ERuLv , , , , , ,rR   r   a  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.

Models will be loaded and unloaded automatically based on usage and a timeout.


The server will expose the following endpoints:
    - POST /v1/chat/completions: Generates chat completions.
    - POST /v1/responses: Generates responses.
    - POST /v1/audio/transcriptions: Generates transcriptions from audio.
    - GET /v1/models: Lists available models for 3rd party tools.

Requires FastAPI and Uvicorn to be installed.
__main__)rW  r  r   enumr   r  r   r  r  r   r  r4  collections.abcr   r   
contextlibr   	functoolsr   r   r   typingr	   r
   r   r   r   r  huggingface_hubr   tokenizers.decodersr   r   r   r   r   r   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   utilsr   r   r    r!   r  r"   r  PILr#   r9  rK  rF  r$   r%   fastapi.middleware.corsr&   fastapi.responsesr'   r(    openai.types.audio.transcriptionr)   .openai.types.audio.transcription_create_paramsr*   openai.types.chatr+   r,   r-   !openai.types.chat.chat_completionr.   'openai.types.chat.chat_completion_chunkr/   r0   r1   r2   r  *openai.types.chat.completion_create_paramsr3   openai.types.responsesr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   -openai.types.responses.response_create_paramsrC   pydanticrD   rE   rF   rH   rW   rY   r|  r  r  r}  r  r  r;  rK   r   rB  r'  ro  rP  r3  r   r   r   Enumr   rH  r   r   r   r   rN   r\  rQ   rR   rS   <module>r     sZ       				 				  				        / / / / / / / / * * * * * *                   G G G G G G G G G G G G G G  * * * * * * , , , , , ,           e e e e e e e e e e e e                              K          KJJJJJ  NNN  k 4 4 6 6k;O;O;Q;QkViViVkVk    tNNN........666666AAAAAAAA>>>>>>\\\\\\cccccccccc888888                 [ZZZZZ                                 " \[[[[[@@@@@@@@@@    4QY^        6U]b        0MUZ     %%NOO&;'RSS)k*OPP   % % %!.# # # 
	H	%	%
    !D!2!7!7!9!9::   ! ! !* * *    ty   8	8-8 	8 8 8 8v       1@ 1@ 1@ 1@ 1@ 1@ 1@ 1@h~, ~, ~, ~, ~, ~, ~, ~,D0 zEGGEEE rR   