diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py
index 2c2f0f6c201..2d0fb38eebf 100644
--- a/fastdeploy/engine/request.py
+++ b/fastdeploy/engine/request.py
@@ -75,6 +75,7 @@ def __init__(
         pooling_params: Optional[PoolingParams] = None,
         multimodal_inputs: Optional[dict] = None,
         multimodal_data: Optional[dict] = None,
+        mm_processor_kwargs: Optional[dict] = None,
         disable_chat_template: bool = False,
         disaggregate_info: Optional[dict] = None,
         draft_token_ids: Optional[list[int]] = None,
@@ -133,6 +134,7 @@ def __init__(
         # Multi-modal related
         self.multimodal_inputs = multimodal_inputs
         self.multimodal_data = multimodal_data
+        self.mm_processor_kwargs = mm_processor_kwargs
         self.multimodal_img_boundaries = None

         self.enable_thinking = enable_thinking
@@ -226,6 +228,7 @@ def from_dict(cls, d: dict):
             eos_token_ids=d.get("eos_token_ids"),
             multimodal_inputs=d.get("multimodal_inputs"),
             multimodal_data=d.get("multimodal_data"),
+            mm_processor_kwargs=d.get("mm_processor_kwargs"),
             disable_chat_template=d.get("disable_chat_template"),
             disaggregate_info=d.get("disaggregate_info"),
             draft_token_ids=d.get("draft_token_ids"),
@@ -298,6 +301,7 @@ def to_dict(self) -> dict:
             "tools": self.tools,
             "eos_token_ids": self.eos_token_ids,
             "multimodal_data": self.multimodal_data,
+            "mm_processor_kwargs": self.mm_processor_kwargs,
             "disable_chat_template": self.disable_chat_template,
             "disaggregate_info": self.disaggregate_info,
             "draft_token_ids": self.draft_token_ids,
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index 318b46b22d7..ab9498a954f 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -712,6 +712,7 @@ class ChatCompletionRequest(BaseModel):
     guided_regex: Optional[str] = None
     guided_choice: Optional[list[str]] = None
     guided_grammar: Optional[str] = None
+    mm_processor_kwargs: Optional[dict] = None

     return_token_ids: Optional[bool] = None
     prompt_token_ids: Optional[List[int]] = None
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index 7909a5dab21..3dd3cdbb86b 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -217,12 +217,13 @@ def process_request_dict(self, request, max_model_len=None):
             bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
             request["bad_words_token_ids"] = bad_words_token_ids

+        processor_kwargs = self._parse_processor_kwargs(request.get("mm_processor_kwargs"))
         if request.get("prompt_token_ids"):
             messages = request.get("messages")
             if messages:
                 self._check_mm_limits(messages)
             request.setdefault("enable_thinking", True)
-            outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request)
+            outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request, **processor_kwargs)
         elif request.get("prompt"):
             multimodal_data = request.get("multimodal_data")
             if multimodal_data is None:
@@ -231,7 +232,7 @@ def process_request_dict(self, request, max_model_len=None):
                 multimodal_data = {}
             self._check_mm_limits(multimodal_data)
             images = multimodal_data.get("image", None)
             videos = multimodal_data.get("video", None)
             request["prompt_tokens"] = request.get("prompt")
-            outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos)
+            outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos, **processor_kwargs)
         elif request.get("messages"):
             messages = request["messages"]
             self._check_mm_limits(messages)
@@ -243,7 +244,7 @@ def process_request_dict(self, request, max_model_len=None):
                             request[k] = v
                 else:
                     raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-            outputs = self.ernie4_5_processor.request2ids(request)
+            outputs = self.ernie4_5_processor.request2ids(request, **processor_kwargs)
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
diff --git a/fastdeploy/input/ernie4_5_vl_processor/process.py b/fastdeploy/input/ernie4_5_vl_processor/process.py
index dd475c65d63..ab58708c36e 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/process.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/process.py
@@ -34,6 +34,7 @@
 from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
+from fastdeploy.multimodal.utils import set_processor_kwargs
 from fastdeploy.utils import data_processor_logger

 from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
@@ -198,7 +199,8 @@ def eval(self) -> None:
         """Enable evaluation mode (doesn't produce labels)."""
         self.is_training = False

-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
+    @set_processor_kwargs
+    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None, **kwargs):
         """
         Convert chat text into model inputs.
@@ -319,15 +321,16 @@ def extract_mm_items(self, request: Dict[str, Any]):
                 raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
         return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items

+    @set_processor_kwargs
     def request2ids(
-        self, request: Dict[str, Any], tgts: List[str] = None
+        self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
     ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         """
         Convert chat messages into model inputs.
         Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
         """
         images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
-
+
         if self.tokenizer.chat_template is None:
             raise ValueError("This model does not support chat template.")
@@ -361,8 +364,9 @@ def request2ids(

         return outputs

+    @set_processor_kwargs
     def prompt_token_ids2outputs(
-        self, request: Dict[str, Any], tgts: List[str] = None
+        self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
     ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         outputs = {
             "input_ids": [],
diff --git a/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py b/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py
index a81a40d494b..4bada3915c1 100644
--- a/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py
+++ b/fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py
@@ -214,6 +214,7 @@ def process_request_dict(self, request, max_model_len=None):
         # processing stop_sequences and stop_token_ids
         process_stop_token_ids(request, self.update_stop_seq)

+        processor_kwargs = self._parse_processor_kwargs(request.get("mm_processor_kwargs"))
         if request.get("prompt"):
             multimodal_data = request.get("multimodal_data")
             if multimodal_data is None:
@@ -221,13 +222,11 @@ def process_request_dict(self, request, max_model_len=None):
                 multimodal_data = {}
             self._check_mm_limits(multimodal_data)
             images = multimodal_data.get("image", None)
             videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request["prompt"], images, videos)
-
+            outputs = self.processor.text2ids(request["prompt"], images, videos, **processor_kwargs)
         elif request.get("messages"):
             messages = request["messages"]
             self._check_mm_limits(messages)
-            outputs = self.processor.request2ids(request)
-
+            outputs = self.processor.request2ids(request, **processor_kwargs)
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
diff --git a/fastdeploy/input/paddleocr_vl_processor/process.py b/fastdeploy/input/paddleocr_vl_processor/process.py
index 8090abff12b..176f0d71083 100644
--- a/fastdeploy/input/paddleocr_vl_processor/process.py
+++ b/fastdeploy/input/paddleocr_vl_processor/process.py
@@ -30,6 +30,7 @@
 from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
+from fastdeploy.multimodal.utils import set_processor_kwargs
 from fastdeploy.utils import data_processor_logger

 from .image_processor import ImageProcessor
@@ -135,7 +136,8 @@ def calc_one(thw):

         return calc_one(grid_thw)

-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
+    @set_processor_kwargs
+    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None, **kwargs):
         """
         Convert text with image/video placeholders into model inputs.
@@ -221,8 +223,9 @@ def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=N

         return outputs

+    @set_processor_kwargs
     def request2ids(
-        self, request: Dict[str, Any], tgts: List[str] = None
+        self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
     ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         """
         Convert chat request with multimodal messages into model inputs.
diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py
index 20b48b7f170..7b778fd1006 100644
--- a/fastdeploy/input/qwen_vl_processor/process.py
+++ b/fastdeploy/input/qwen_vl_processor/process.py
@@ -30,6 +30,7 @@
 from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
+from fastdeploy.multimodal.utils import set_processor_kwargs
 from fastdeploy.utils import data_processor_logger

 from .image_processor import ImageProcessor
@@ -140,7 +141,8 @@ def calc_one(thw):

         return calc_one(grid_thw)

-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
+    @set_processor_kwargs
+    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None, **kwargs):
         """
         Convert text with image/video placeholders into model inputs.
@@ -224,8 +226,9 @@ def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=N

         return outputs

+    @set_processor_kwargs
     def request2ids(
-        self, request: Dict[str, Any], tgts: List[str] = None
+        self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
     ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
         """
         Convert chat request with multimodal messages into model inputs.
diff --git a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
index 5e976bce4d2..c71ac9d52fe 100644
--- a/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
+++ b/fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
@@ -219,6 +219,7 @@ def process_request_dict(self, request, max_model_len=None):
             bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
             request["bad_words_token_ids"] = bad_words_token_ids

+        processor_kwargs = self._parse_processor_kwargs(request.get("mm_processor_kwargs"))
         if request.get("prompt"):
             multimodal_data = request.get("multimodal_data")
             if multimodal_data is None:
@@ -226,8 +227,7 @@ def process_request_dict(self, request, max_model_len=None):
                 multimodal_data = {}
             self._check_mm_limits(multimodal_data)
             images = multimodal_data.get("image", None)
             videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request["prompt"], images, videos)
-
+            outputs = self.processor.text2ids(request["prompt"], images, videos, **processor_kwargs)
         elif request.get("messages"):
             messages = request["messages"]
             self._check_mm_limits(messages)
@@ -240,7 +240,7 @@ def process_request_dict(self, request, max_model_len=None):
                             request[k] = v
                 else:
                     raise ValueError("Invalid input: chat_template_kwargs must be a dict")
             request.setdefault("enable_thinking", False)
-            outputs = self.processor.request2ids(request)
+            outputs = self.processor.request2ids(request, **processor_kwargs)
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
diff --git a/fastdeploy/multimodal/utils.py b/fastdeploy/multimodal/utils.py
index fa3ad4cbe22..8a95829317f 100644
--- a/fastdeploy/multimodal/utils.py
+++ b/fastdeploy/multimodal/utils.py
@@ -46,3 +46,29 @@ def _convert_transparent_paste(image):
             pass

     return ImageOps.exif_transpose(image)
+
+def set_processor_kwargs(func):
+    """
+    Temporarily override matching processor attributes with per-request kwargs.
+    """
+    def wrapper(self, *args, **kwargs):
+        original_kwargs = {}
+        for k, v in kwargs.items():
+            if hasattr(self, k.replace("video_", "")):
+                k = k.replace("video_", "")
+            if hasattr(self, k):
+                if k.endswith("min_pixels"):
+                    assert getattr(self, k) <= v, f"{k} should be no smaller than its initial value"
+                if k.endswith("max_pixels"):
+                    assert getattr(self, k) >= v, f"{k} should be no larger than its initial value"
+                original_kwargs[k] = getattr(self, k)
+                setattr(self, k, v)
+
+        ret = func(self, *args, **kwargs)
+
+        for k, v in original_kwargs.items():
+            setattr(self, k, v)
+
+        return ret
+    return wrapper
+
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 768e59b2460..52e57d239f8 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -2402,7 +2402,7 @@ class at the server level, which is too granular for ModelRunner.

         # 4. Compute logits, Sample
         logits = self.model.compute_logits(hidden_states)
-
+
         if not self.speculative_decoding:
             set_value_by_flags_and_idx(
                 self.share_inputs["pre_ids"],
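
Example (illustrative toy code, not part of the patch): set_processor_kwargs is an override-then-restore decorator. Kwargs whose names match processor attributes (with a video_ prefix optionally mapped onto the shared attribute) temporarily replace those attributes for the duration of the wrapped call, and min_pixels may only grow while max_pixels may only shrink relative to their initial values. A standalone sketch with a hypothetical ToyProcessor, assuming the patch is applied:

    from fastdeploy.multimodal.utils import set_processor_kwargs

    class ToyProcessor:
        def __init__(self):
            self.min_pixels = 56 * 56          # 3136
            self.max_pixels = 28 * 28 * 1280   # 1003520

        @set_processor_kwargs
        def run(self, **kwargs):
            # Reads the (possibly overridden) attribute values.
            return self.min_pixels, self.max_pixels

    p = ToyProcessor()
    print(p.run(min_pixels=112 * 112))  # (12544, 1003520): override visible inside the call
    print(p.min_pixels)                 # 3136: original value restored after the call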