Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions fastdeploy/engine/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def __init__(
pooling_params: Optional[PoolingParams] = None,
multimodal_inputs: Optional[dict] = None,
multimodal_data: Optional[dict] = None,
mm_processor_kwargs: Optional[dict] = None,
disable_chat_template: bool = False,
disaggregate_info: Optional[dict] = None,
draft_token_ids: Optional[list[int]] = None,
Expand Down Expand Up @@ -132,6 +133,7 @@ def __init__(
# Multi-modal related
self.multimodal_inputs = multimodal_inputs
self.multimodal_data = multimodal_data
self.mm_processor_kwargs = mm_processor_kwargs
self.multimodal_img_boundaries = None

self.enable_thinking = enable_thinking
Expand Down Expand Up @@ -224,6 +226,7 @@ def from_dict(cls, d: dict):
eos_token_ids=d.get("eos_token_ids"),
multimodal_inputs=d.get("multimodal_inputs"),
multimodal_data=d.get("multimodal_data"),
mm_processor_kwargs=d.get("mm_processor_kwargs"),
disable_chat_template=d.get("disable_chat_template"),
disaggregate_info=d.get("disaggregate_info"),
draft_token_ids=d.get("draft_token_ids"),
Expand Down Expand Up @@ -296,6 +299,7 @@ def to_dict(self) -> dict:
"tools": self.tools,
"eos_token_ids": self.eos_token_ids,
"multimodal_data": self.multimodal_data,
"mm_processor_kwargs": self.mm_processor_kwargs,
"disable_chat_template": self.disable_chat_template,
"disaggregate_info": self.disaggregate_info,
"draft_token_ids": self.draft_token_ids,
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/entrypoints/openai/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,7 @@ class ChatCompletionRequest(BaseModel):
guided_regex: Optional[str] = None
guided_choice: Optional[list[str]] = None
guided_grammar: Optional[str] = None
mm_processor_kwargs: Optional[dict] = None

return_token_ids: Optional[bool] = None
prompt_token_ids: Optional[List[int]] = None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,13 @@ def process_request_dict(self, request, max_model_len=None):
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
request["bad_words_token_ids"] = bad_words_token_ids

processor_kwargs = self._parse_processor_kwargs(request.get("mm_processor_kwargs"))
if request.get("prompt_token_ids"):
messages = request.get("messages")
if messages:
self._check_mm_limits(messages)
request.setdefault("enable_thinking", True)
outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request)
outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request, **processor_kwargs)
elif request.get("prompt"):
multimodal_data = request.get("multimodal_data")
if multimodal_data is None:
Expand All @@ -231,7 +232,7 @@ def process_request_dict(self, request, max_model_len=None):
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
request["prompt_tokens"] = request.get("prompt")
outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos)
outputs = self.ernie4_5_processor.text2ids(request["prompt"], images, videos, **processor_kwargs)
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
Expand All @@ -243,7 +244,7 @@ def process_request_dict(self, request, max_model_len=None):
request[k] = v
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
outputs = self.ernie4_5_processor.request2ids(request)
outputs = self.ernie4_5_processor.request2ids(request, **processor_kwargs)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

Expand Down
12 changes: 8 additions & 4 deletions fastdeploy/input/ernie4_5_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.multimodal.utils import set_processor_kwargs
from fastdeploy.utils import data_processor_logger

from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
Expand Down Expand Up @@ -172,7 +173,8 @@ def eval(self) -> None:
"""Enable evaluation mode (doesn't produce labels)."""
self.is_training = False

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
@set_processor_kwargs
def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None, **kwargs):
"""
Convert chat text into model inputs.

Expand Down Expand Up @@ -293,15 +295,16 @@ def extract_mm_items(self, request: Dict[str, Any]):
raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items

@set_processor_kwargs
def request2ids(
self, request: Dict[str, Any], tgts: List[str] = None
self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
"""
Convert chat messages into model inputs.
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
"""
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)

if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat template.")

Expand Down Expand Up @@ -335,8 +338,9 @@ def request2ids(

return outputs

@set_processor_kwargs
def prompt_token_ids2outputs(
self, request: Dict[str, Any], tgts: List[str] = None
self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
outputs = {
"input_ids": [],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,20 +214,19 @@ def process_request_dict(self, request, max_model_len=None):
# processing stop_sequences and stop_token_ids
process_stop_token_ids(request, self.update_stop_seq)

processor_kwargs = self._parse_processor_kwargs(request.get("mm_processor_kwargs"))
if request.get("prompt"):
multimodal_data = request.get("multimodal_data")
if multimodal_data is None:
multimodal_data = {}
self._check_mm_limits(multimodal_data)
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
outputs = self.processor.text2ids(request["prompt"], images, videos)

outputs = self.processor.text2ids(request["prompt"], images, videos, **processor_kwargs)
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
outputs = self.processor.request2ids(request)

outputs = self.processor.request2ids(request, **processor_kwargs)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

Expand Down
7 changes: 5 additions & 2 deletions fastdeploy/input/paddleocr_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.multimodal.utils import set_processor_kwargs
from fastdeploy.utils import data_processor_logger

from .image_processor import ImageProcessor
Expand Down Expand Up @@ -112,7 +113,8 @@ def __init__(
"assistant": "Assistant: ",
}

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
@set_processor_kwargs
def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None, **kwargs):
"""
Convert text with image/video placeholders into model inputs.

Expand Down Expand Up @@ -198,8 +200,9 @@ def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=N

return outputs

@set_processor_kwargs
def request2ids(
self, request: Dict[str, Any], tgts: List[str] = None
self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
"""
Convert chat request with multimodal messages into model inputs.
Expand Down
7 changes: 5 additions & 2 deletions fastdeploy/input/qwen_vl_processor/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.multimodal.utils import set_processor_kwargs
from fastdeploy.utils import data_processor_logger

from .image_processor import ImageProcessor
Expand Down Expand Up @@ -111,7 +112,8 @@ def __init__(
"assistant": "Assistant: ",
}

def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
@set_processor_kwargs
def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None, **kwargs):
"""
Convert text with image/video placeholders into model inputs.

Expand Down Expand Up @@ -195,8 +197,9 @@ def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=N

return outputs

@set_processor_kwargs
def request2ids(
self, request: Dict[str, Any], tgts: List[str] = None
self, request: Dict[str, Any], tgts: List[str] = None, **kwargs
) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
"""
Convert chat request with multimodal messages into model inputs.
Expand Down
6 changes: 3 additions & 3 deletions fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,15 +219,15 @@ def process_request_dict(self, request, max_model_len=None):
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
request["bad_words_token_ids"] = bad_words_token_ids

processor_kwargs = self._parse_processor_kwargs(request.get("mm_processor_kwargs"))
if request.get("prompt"):
multimodal_data = request.get("multimodal_data")
if multimodal_data is None:
multimodal_data = {}
self._check_mm_limits(multimodal_data)
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
outputs = self.processor.text2ids(request["prompt"], images, videos)

outputs = self.processor.text2ids(request["prompt"], images, videos, **processor_kwargs)
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
Expand All @@ -240,7 +240,7 @@ def process_request_dict(self, request, max_model_len=None):
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", False)
outputs = self.processor.request2ids(request)
outputs = self.processor.request2ids(request, **processor_kwargs)

else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
Expand Down
24 changes: 24 additions & 0 deletions fastdeploy/multimodal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,27 @@ def _convert_transparent_paste(image):
pass

return ImageOps.exif_transpose(image)

def set_processor_kwargs(func):
    """
    Decorator: temporarily apply per-request processor kwargs to ``self``.

    For each keyword argument whose name matches an attribute on ``self``
    (either directly, or after stripping a ``video_`` prefix, e.g.
    ``video_fps`` -> ``fps``), the attribute is overridden with the kwarg
    value for the duration of the call and restored afterwards — including
    when the wrapped function raises, so per-request overrides never leak
    into subsequent requests.

    NOTE(review): this mutates shared processor state, so it is not safe if
    one processor instance serves concurrent requests — confirm callers are
    single-threaded per instance.
    """
    import functools

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        saved = {}
        for key, value in kwargs.items():
            if hasattr(self, key):
                attr = key
            elif hasattr(self, key.replace("video_", "")):
                attr = key.replace("video_", "")
            else:
                continue
            # Record the original value only once: if both "fps" and
            # "video_fps" arrive in the same request, the first snapshot must
            # win — otherwise we would "restore" an already-overridden value.
            if attr not in saved:
                saved[attr] = getattr(self, attr)
            setattr(self, attr, value)

        try:
            return func(self, *args, **kwargs)
        finally:
            # Restore even on exception so overrides cannot leak.
            for attr, value in saved.items():
                setattr(self, attr, value)

    return wrapper

Loading