vllm.model_executor.models.transformers.multimodal

Transformers modeling backend mixin for multi-modal models.

DYNAMIC_ARG_DIMS module-attribute

DYNAMIC_ARG_DIMS = {
    "input_ids": 0,
    "positions": -1,
    "intermediate_tensors": 0,
    "inputs_embeds": 0,
}
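
DYNAMIC_ARG_DIMS maps each forward() argument to the dimension that varies between batches, with -1 meaning the last dimension. A hedged sketch, assuming such a table is consumed to mark dynamic shapes for torch.compile (the helper name and wiring here are illustrative, not vLLM internals):

import torch

# Mark the listed dimension of each tensor argument as dynamic so
# torch.compile does not specialize on its size.
def mark_dynamic_args(kwargs: dict, dynamic_arg_dims: dict[str, int]) -> None:
    for name, dim in dynamic_arg_dims.items():
        value = kwargs.get(name)
        if isinstance(value, torch.Tensor):
            # Normalize -1 to the tensor's last dimension index
            torch._dynamo.mark_dynamic(value, dim % value.ndim)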

MultiModalDummyInputsBuilder

Bases: BaseDummyInputsBuilder[MultiModalProcessingInfo]

Source code in vllm/model_executor/models/transformers/multimodal.py
class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingInfo]):
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        num_images = mm_counts.get("image", 0)
        num_videos = mm_counts.get("video", 0)

        processor = self.info.get_hf_processor()
        if "gemma3" in processor.__class__.__name__.lower():
            image_token = processor.boi_token
        else:
            image_token = getattr(processor, "image_token", "")
        video_token = getattr(processor, "video_token", "")
        return image_token * num_images + video_token * num_videos

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, "BaseDummyOptions"] | None = None,
    ) -> MultiModalDataDict:
        num_images = mm_counts.get("image", 0)
        num_videos = mm_counts.get("video", 0)

        target_width, target_height = self.info.get_max_image_size()
        max_total_frames = self.info.get_max_video_frames(seq_len)
        target_num_frames = max_total_frames // max(num_videos, 1)

        image_overrides = mm_options.get("image") if mm_options else None
        video_overrides = mm_options.get("video") if mm_options else None

        return {
            "image": self._get_dummy_images(
                width=target_width,
                height=target_height,
                num_images=num_images,
                overrides=image_overrides,
            ),
            "video": self._get_dummy_videos(
                width=target_width,
                height=target_height,
                num_frames=target_num_frames,
                num_videos=num_videos,
                overrides=video_overrides,
            ),
        }

get_dummy_mm_data

get_dummy_mm_data(
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, BaseDummyOptions]
    | None = None,
) -> MultiModalDataDict
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_dummy_mm_data(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
    mm_options: Mapping[str, "BaseDummyOptions"] | None = None,
) -> MultiModalDataDict:
    num_images = mm_counts.get("image", 0)
    num_videos = mm_counts.get("video", 0)

    target_width, target_height = self.info.get_max_image_size()
    max_total_frames = self.info.get_max_video_frames(seq_len)
    target_num_frames = max_total_frames // max(num_videos, 1)

    image_overrides = mm_options.get("image") if mm_options else None
    video_overrides = mm_options.get("video") if mm_options else None

    return {
        "image": self._get_dummy_images(
            width=target_width,
            height=target_height,
            num_images=num_images,
            overrides=image_overrides,
        ),
        "video": self._get_dummy_videos(
            width=target_width,
            height=target_height,
            num_frames=target_num_frames,
            num_videos=num_videos,
            overrides=video_overrides,
        ),
    }
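
The frame budget returned by get_max_video_frames is split evenly across the requested dummy videos. A tiny worked example with made-up numbers:

# A 32-frame budget shared by 4 dummy videos gives each video 32 // 4 = 8 frames.
max_total_frames = 32
num_videos = 4
target_num_frames = max_total_frames // max(num_videos, 1)
assert target_num_frames == 8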

get_dummy_text

get_dummy_text(mm_counts: Mapping[str, int]) -> str
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)
    num_videos = mm_counts.get("video", 0)

    processor = self.info.get_hf_processor()
    if "gemma3" in processor.__class__.__name__.lower():
        image_token = processor.boi_token
    else:
        image_token = getattr(processor, "image_token", "")
    video_token = getattr(processor, "video_token", "")
    return image_token * num_images + video_token * num_videos
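
For illustration, with hypothetical placeholder strings (the real tokens come from the loaded HF processor), the dummy prompt is simply the placeholders repeated per modality:

# "<image>" and "<video>" are stand-ins for whatever placeholder tokens
# the loaded HF processor actually defines.
image_token, video_token = "<image>", "<video>"
mm_counts = {"image": 2, "video": 1}
dummy_text = image_token * mm_counts["image"] + video_token * mm_counts["video"]
assert dummy_text == "<image><image><video>"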

MultiModalMixin

Bases: SupportsMultiModal, SupportsMRoPE

Source code in vllm/model_executor/models/transformers/multimodal.py
class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
    supports_multimodal_raw_input_only = True

    # Backwards compatibility for previously released models. Their state dicts
    # had different formats and cannot be loaded with the `AutoModel` mapping as-is
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "language_model.model": "model.language_model",
            "text_model.model": "model.text_model",
            "vision_tower": "model.vision_tower",
            "vqmodel": "model.vqmodel",
            "visual": "model.visual",
            "vision_model": "model.vision_model",
            "vision_embed_tokens": "model.vision_embed_tokens",
            "image_newline": "model.image_newline",
            "multi_modal_projector": "model.multi_modal_projector",
            "text_model.lm_head": "lm_head",
            "language_model.lm_head": "lm_head",
            # Qwen models used "model" as the name for the language model.
            # Therefore, we must map each submodule explicitly to avoid
            # conflicts with newer models that use "model.language_model".
            "model.embed_tokens": "model.language_model.embed_tokens",
            "model.layers": "model.language_model.layers",
            "model.norm": "model.language_model.norm",
        }
    )

    def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
        # Skip SupportsMRoPE.__init__ and call the next class in MRO
        super(SupportsMRoPE, self).__init__(vllm_config=vllm_config, prefix=prefix)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        # Gemma3 and PaliGemma need `token_type_ids` to work correctly.
        # Other models will not have `token_type_ids` in kwargs
        kwargs = {k: v for k, v in kwargs.items() if k == "token_type_ids"}
        model_output = super().forward(
            input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
        )
        return model_output

    def get_language_model(self) -> torch.nn.Module:
        """Transformers modeling backend multimodal classes do not contain a separate
        vLLM language model class. Therefore, in order to return a vLLM language model
        class, we use a wrapper that gives `self` the same interface as a text model."""

        # Exclude self and object
        bases = self.__class__.mro()[1:-1]
        # Keep only classes defined in `vllm.model_executor.models.transformers`
        bases = [b for b in bases if ".transformers." in b.__module__]
        # Exclude MultiModalMixin itself
        bases = [b for b in bases if b is not MultiModalMixin]

        class LanguageModel(*bases):
            def __init__(self, multimodal_model):
                # Don't call super().__init__() to avoid re-initialization
                self.__dict__.update(multimodal_model.__dict__)

            model = getattr_iter(self.model, ("language_model", "text_model"), None)

        return LanguageModel(self)

    def embed_multimodal(self, **kwargs) -> MultiModalEmbeddings:
        pixel_values: torch.Tensor | None = kwargs.pop("pixel_values", None)
        image_embeds: torch.Tensor | None = kwargs.pop("image_embeds", None)
        pixel_values_videos: torch.Tensor | None = kwargs.pop(
            "pixel_values_videos", None
        )
        video_embeds: torch.Tensor | None = kwargs.pop("video_embeds", None)

        # Model might use `image_patches` instead of `pixel_values`
        if pixel_values is None:
            pixel_values = kwargs.pop("image_patches", None)

        multimodal_embeddings: list[torch.Tensor] = []

        if image_embeds is not None:
            multimodal_embeddings += image_embeds

        kwargs.pop("token_type_ids", None)  # used only in `forward`
        num_image_patches = kwargs.pop("num_image_patches", None)
        num_video_patches = kwargs.pop("num_video_patches", None)

        if pixel_values is not None:
            vision_embeddings = self.model.get_image_features(pixel_values, **kwargs)

            if isinstance(vision_embeddings, tuple):
                # For Qwen3-VL, the deepstack visual features are also returned
                vision_embeddings = vision_embeddings[0]
            if isinstance(vision_embeddings, torch.Tensor):
                if vision_embeddings.ndim == 2:
                    vision_embeddings = vision_embeddings.unsqueeze(0)

                # Embeddings have to be a list of `num_images` 2D tensors, but
                # Transformers returns them concatenated when patches have
                # different sizes. We split them back to make vLLM happy
                vision_embeddings = torch.split(
                    vision_embeddings, num_image_patches.flatten().tolist()
                )
                vision_embeddings = [
                    embed.flatten(start_dim=0, end_dim=-2)
                    for embed in vision_embeddings
                ]
            multimodal_embeddings += vision_embeddings

        if video_embeds is not None:
            multimodal_embeddings += video_embeds

        if pixel_values_videos is not None:
            vision_embeddings = self.model.get_video_features(
                pixel_values_videos, **kwargs
            )

            if isinstance(vision_embeddings, tuple):
                # For Qwen3-VL, the deepstack visual features are also returned
                vision_embeddings = vision_embeddings[0]
            if isinstance(vision_embeddings, torch.Tensor):
                if vision_embeddings.ndim == 2:
                    vision_embeddings = vision_embeddings.unsqueeze(0)

                # Embeddings have to be a list of `num_videos` 2D tensors, but
                # Transformers returns them concatenated when patches have
                # different sizes. We split them back to make vLLM happy
                vision_embeddings = torch.split(
                    vision_embeddings, num_video_patches.flatten().tolist()
                )
                vision_embeddings = [
                    embed.flatten(start_dim=0, end_dim=-2)
                    for embed in vision_embeddings
                ]
            multimodal_embeddings += vision_embeddings

        return multimodal_embeddings

    def get_mrope_input_positions(
        self,
        input_tokens: list[int],
        mm_features: list[MultiModalFeatureSpec],
    ) -> tuple[torch.Tensor, int]:
        kwargs = MultiModalFeatureSpec.gather_kwargs(
            mm_features,
            {
                "image_grid_thw",
                "video_grid_thw",
                "second_per_grid_ts",
                "audio_feature_lengths",
                "use_audio_in_video",
            },
        )
        if any(
            v
            for k, v in kwargs.items()
            if k not in {"image_grid_thw", "video_grid_thw"}
        ):
            raise NotImplementedError(
                "Transformers modeling backend only supports images and videos."
            )

        image_grid_thw = kwargs.get("image_grid_thw", None)
        video_grid_thw = kwargs.get("video_grid_thw", None)

        image_grid_thw = torch.stack(image_grid_thw) if image_grid_thw else None
        video_grid_thw = torch.stack(video_grid_thw) if video_grid_thw else None

        mrope_positions, mrope_position_delta = self.model.get_rope_index(
            input_ids=torch.tensor(input_tokens).unsqueeze(0),
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
        )

        mrope_positions = mrope_positions[:, 0]
        mrope_position_delta = mrope_position_delta[0].item()

        return mrope_positions, mrope_position_delta

hf_to_vllm_mapper class-attribute instance-attribute

hf_to_vllm_mapper = WeightsMapper(
    orig_to_new_prefix={
        "language_model.model": "model.language_model",
        "text_model.model": "model.text_model",
        "vision_tower": "model.vision_tower",
        "vqmodel": "model.vqmodel",
        "visual": "model.visual",
        "vision_model": "model.vision_model",
        "vision_embed_tokens": "model.vision_embed_tokens",
        "image_newline": "model.image_newline",
        "multi_modal_projector": "model.multi_modal_projector",
        "text_model.lm_head": "lm_head",
        "language_model.lm_head": "lm_head",
        "model.embed_tokens": "model.language_model.embed_tokens",
        "model.layers": "model.language_model.layers",
        "model.norm": "model.language_model.norm",
    }
)
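
The mapping above rewrites checkpoint parameter names by prefix so older state dicts line up with the vLLM module tree. A simplified, stand-alone sketch of that idea (the real WeightsMapper may apply different matching rules):

# Simplified stand-in for the prefix remapping above.
def remap(name: str, orig_to_new_prefix: dict[str, str]) -> str:
    for old, new in orig_to_new_prefix.items():
        if name.startswith(old):
            return new + name[len(old):]
    return name

assert (
    remap("visual.blocks.0.attn.qkv.weight", {"visual": "model.visual"})
    == "model.visual.blocks.0.attn.qkv.weight"
)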

supports_multimodal_raw_input_only class-attribute instance-attribute

supports_multimodal_raw_input_only = True

__init__

__init__(*, vllm_config: VllmConfig, prefix: str = '')
Source code in vllm/model_executor/models/transformers/multimodal.py
def __init__(self, *, vllm_config: "VllmConfig", prefix: str = ""):
    # Skip SupportsMRoPE.__init__ and call the next class in MRO
    super(SupportsMRoPE, self).__init__(vllm_config=vllm_config, prefix=prefix)
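
super(SupportsMRoPE, self) starts the method lookup after SupportsMRoPE in the MRO, so its __init__ is skipped. A minimal, self-contained illustration of the same trick (classes A/B/C are hypothetical):

class A:
    def __init__(self):
        print("A.__init__")

class B(A):
    def __init__(self):
        print("B.__init__")

class C(B):
    def __init__(self):
        # super(B, self) starts the lookup *after* B in C's MRO,
        # so B.__init__ is skipped and A.__init__ runs instead.
        super(B, self).__init__()

C()  # prints only "A.__init__"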

embed_multimodal

embed_multimodal(**kwargs) -> MultiModalEmbeddings
Source code in vllm/model_executor/models/transformers/multimodal.py
def embed_multimodal(self, **kwargs) -> MultiModalEmbeddings:
    pixel_values: torch.Tensor | None = kwargs.pop("pixel_values", None)
    image_embeds: torch.Tensor | None = kwargs.pop("image_embeds", None)
    pixel_values_videos: torch.Tensor | None = kwargs.pop(
        "pixel_values_videos", None
    )
    video_embeds: torch.Tensor | None = kwargs.pop("video_embeds", None)

    # Model might use `image_patches` instead of `pixel_values`
    if pixel_values is None:
        pixel_values = kwargs.pop("image_patches", None)

    multimodal_embeddings: list[torch.Tensor] = []

    if image_embeds is not None:
        multimodal_embeddings += image_embeds

    kwargs.pop("token_type_ids", None)  # used only in `forward`
    num_image_patches = kwargs.pop("num_image_patches", None)
    num_video_patches = kwargs.pop("num_video_patches", None)

    if pixel_values is not None:
        vision_embeddings = self.model.get_image_features(pixel_values, **kwargs)

        if isinstance(vision_embeddings, tuple):
            # For Qwen3-VL, the deepstack visual features are also returned
            vision_embeddings = vision_embeddings[0]
        if isinstance(vision_embeddings, torch.Tensor):
            if vision_embeddings.ndim == 2:
                vision_embeddings = vision_embeddings.unsqueeze(0)

            # Embeddings have to be a list of `num_images` 2D tensors, but
            # Transformers returns them concatenated when patches have
            # different sizes. We split them back to make vLLM happy
            vision_embeddings = torch.split(
                vision_embeddings, num_image_patches.flatten().tolist()
            )
            vision_embeddings = [
                embed.flatten(start_dim=0, end_dim=-2)
                for embed in vision_embeddings
            ]
        multimodal_embeddings += vision_embeddings

    if video_embeds is not None:
        multimodal_embeddings += video_embeds

    if pixel_values_videos is not None:
        vision_embeddings = self.model.get_video_features(
            pixel_values_videos, **kwargs
        )

        if isinstance(vision_embeddings, tuple):
            # For Qwen3-VL, the deepstack visual features are also returned
            vision_embeddings = vision_embeddings[0]
        if isinstance(vision_embeddings, torch.Tensor):
            if vision_embeddings.ndim == 2:
                vision_embeddings = vision_embeddings.unsqueeze(0)

            # Embeddings have to be a list of `num_videos` 2D tensors, but
            # Transformers returns them concatenated when patches have
            # different sizes. We split them back to make vLLM happy
            vision_embeddings = torch.split(
                vision_embeddings, num_video_patches.flatten().tolist()
            )
            vision_embeddings = [
                embed.flatten(start_dim=0, end_dim=-2)
                for embed in vision_embeddings
            ]
        multimodal_embeddings += vision_embeddings

    return multimodal_embeddings
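
The splitting logic above can be seen in isolation: given per-patch features concatenated along dim 0 and the per-image patch counts, the tensor is split and flattened into one 2D embedding per image. Illustrative shapes only:

import torch

# Per-patch features for two images concatenated along dim 0
# (3 + 5 patches, 16 tokens per patch, hidden size 4), plus the
# per-image patch counts reported by the processor.
concat = torch.randn(8, 16, 4)
num_image_patches = torch.tensor([3, 5])

per_image = torch.split(concat, num_image_patches.flatten().tolist())
per_image = [e.flatten(start_dim=0, end_dim=-2) for e in per_image]
assert [tuple(e.shape) for e in per_image] == [(48, 4), (80, 4)]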

forward

forward(
    input_ids: Tensor | None,
    positions: Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: Tensor | None = None,
    **kwargs: object,
) -> Tensor | IntermediateTensors
Source code in vllm/model_executor/models/transformers/multimodal.py
def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
    **kwargs: object,
) -> torch.Tensor | IntermediateTensors:
    # Gemma3 and PaliGemma need `token_type_ids` to work correctly.
    # Other models will not have `token_type_ids` in kwargs
    kwargs = {k: v for k, v in kwargs.items() if k == "token_type_ids"}
    model_output = super().forward(
        input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
    )
    return model_output

get_language_model

get_language_model() -> Module

Transformers modeling backend multimodal classes do not contain a separate vLLM language model class. Therefore, in order to return a vLLM language model class, we use a wrapper that gives self the same interface as a text model.

Source code in vllm/model_executor/models/transformers/multimodal.py
def get_language_model(self) -> torch.nn.Module:
    """Transformers modeling backend multimodal classes do not contain a separate
    vLLM language model class. Therefore, in order to return a vLLM language model
    class, we use a wrapper that gives `self` the same interface as a text model."""

    # Exclude self and object
    bases = self.__class__.mro()[1:-1]
    # Keep only classes defined in `vllm.model_executor.models.transformers`
    bases = [b for b in bases if ".transformers." in b.__module__]
    # Exclude MultiModalMixin itself
    bases = [b for b in bases if b is not MultiModalMixin]

    class LanguageModel(*bases):
        def __init__(self, multimodal_model):
            # Don't call super().__init__() to avoid re-initialization
            self.__dict__.update(multimodal_model.__dict__)

        model = getattr_iter(self.model, ("language_model", "text_model"), None)

    return LanguageModel(self)
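
The LanguageModel wrapper shares the wrapped model's attribute dictionary instead of re-initializing anything. A minimal sketch of that pattern (the classes here are hypothetical):

class Original:
    def __init__(self):
        self.weight = [1, 2, 3]

class Wrapped:
    def __init__(self, other):
        # Share the wrapped object's attributes; nothing is re-initialized.
        self.__dict__.update(other.__dict__)

orig = Original()
wrapped = Wrapped(orig)
assert wrapped.weight is orig.weight  # shared state, not a copy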

get_mrope_input_positions

get_mrope_input_positions(
    input_tokens: list[int],
    mm_features: list[MultiModalFeatureSpec],
) -> tuple[Tensor, int]
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_mrope_input_positions(
    self,
    input_tokens: list[int],
    mm_features: list[MultiModalFeatureSpec],
) -> tuple[torch.Tensor, int]:
    kwargs = MultiModalFeatureSpec.gather_kwargs(
        mm_features,
        {
            "image_grid_thw",
            "video_grid_thw",
            "second_per_grid_ts",
            "audio_feature_lengths",
            "use_audio_in_video",
        },
    )
    if any(
        v
        for k, v in kwargs.items()
        if k not in {"image_grid_thw", "video_grid_thw"}
    ):
        raise NotImplementedError(
            "Transformers modeling backend only supports images and videos."
        )

    image_grid_thw = kwargs.get("image_grid_thw", None)
    video_grid_thw = kwargs.get("video_grid_thw", None)

    image_grid_thw = torch.stack(image_grid_thw) if image_grid_thw else None
    video_grid_thw = torch.stack(video_grid_thw) if video_grid_thw else None

    mrope_positions, mrope_position_delta = self.model.get_rope_index(
        input_ids=torch.tensor(input_tokens).unsqueeze(0),
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
    )

    mrope_positions = mrope_positions[:, 0]
    mrope_position_delta = mrope_position_delta[0].item()

    return mrope_positions, mrope_position_delta
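
The per-item (t, h, w) grids gathered from mm_features are stacked into a single (n, 3) tensor before being passed to the HF model's get_rope_index. For example, with made-up grid values:

import torch

# Two images with (t, h, w) = (1, 16, 16) and (1, 32, 24) stacked into
# one (num_images, 3) tensor.
image_grid_thw = [torch.tensor([1, 16, 16]), torch.tensor([1, 32, 24])]
stacked = torch.stack(image_grid_thw)
assert tuple(stacked.shape) == (2, 3)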

MultiModalProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/transformers/multimodal.py
class MultiModalProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self):
        return {"image": None, "video": None}

    def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
        return {
            "image": self.get_max_image_tokens(),
            "video": self.get_max_video_tokens(seq_len),
        }

    def get_max_image_tokens(self) -> int:
        width, height = self.get_max_image_size()
        processor = self.get_hf_processor()
        multimodal_config = self.ctx.model_config.multimodal_config
        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
        mm_tokens = processor._get_num_multimodal_tokens(
            image_sizes=([height, width],), **mm_processor_kwargs
        )
        image_tokens = mm_tokens["num_image_tokens"][0]
        return image_tokens

    def _get_video_tokens(self, num_frames, width, height) -> int:
        processor = self.get_hf_processor()
        multimodal_config = self.ctx.model_config.multimodal_config
        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
        mm_tokens = processor._get_num_multimodal_tokens(
            video_sizes=([num_frames, height, width],), **mm_processor_kwargs
        )
        video_tokens = mm_tokens["num_video_tokens"][0]
        return video_tokens

    def get_max_video_tokens(self, seq_len: int) -> int:
        width, height = self.get_max_image_size()
        num_frames = self.get_max_video_frames(seq_len)
        return self._get_video_tokens(num_frames, width, height)

    def get_max_image_size(self):
        return 10_000, 10_000  # hardcoded to an arbitrarily large size

    def get_max_video_frames(self, seq_len: int) -> int:
        width, height = self.get_max_image_size()

        max_num_frames = 1

        while True:
            next_num_frames = max_num_frames + 1
            video_tokens = self._get_video_tokens(next_num_frames, width, height)
            if video_tokens > seq_len:
                break

            max_num_frames = next_num_frames

        return max_num_frames

_get_video_tokens

_get_video_tokens(num_frames, width, height) -> int
Source code in vllm/model_executor/models/transformers/multimodal.py
def _get_video_tokens(self, num_frames, width, height) -> int:
    processor = self.get_hf_processor()
    multimodal_config = self.ctx.model_config.multimodal_config
    mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
    mm_tokens = processor._get_num_multimodal_tokens(
        video_sizes=([num_frames, height, width],), **mm_processor_kwargs
    )
    video_tokens = mm_tokens["num_video_tokens"][0]
    return video_tokens

get_max_image_size

get_max_image_size()
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_max_image_size(self):
    return 10_000, 10_000  # hardcoded to an arbitrarily large size

get_max_image_tokens

get_max_image_tokens() -> int
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_max_image_tokens(self) -> int:
    width, height = self.get_max_image_size()
    processor = self.get_hf_processor()
    multimodal_config = self.ctx.model_config.multimodal_config
    mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
    mm_tokens = processor._get_num_multimodal_tokens(
        image_sizes=([height, width],), **mm_processor_kwargs
    )
    image_tokens = mm_tokens["num_image_tokens"][0]
    return image_tokens

get_max_video_frames

get_max_video_frames(seq_len: int) -> int
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_max_video_frames(self, seq_len: int) -> int:
    width, height = self.get_max_image_size()

    max_num_frames = 1

    while True:
        next_num_frames = max_num_frames + 1
        video_tokens = self._get_video_tokens(next_num_frames, width, height)
        if video_tokens > seq_len:
            break

        max_num_frames = next_num_frames

    return max_num_frames
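
The frame search above grows the frame count until the video's token cost exceeds the sequence length. Here is the same loop with a toy stand-in for _get_video_tokens that charges a flat 100 tokens per frame (an assumption for illustration only):

def video_tokens(num_frames: int) -> int:
    # Toy cost model: 100 tokens per frame.
    return 100 * num_frames

seq_len = 1024
max_num_frames = 1
while True:
    next_num_frames = max_num_frames + 1
    if video_tokens(next_num_frames) > seq_len:
        break
    max_num_frames = next_num_frames

assert max_num_frames == 10  # 10 frames fit within the 1024-token budget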

get_max_video_tokens

get_max_video_tokens(seq_len: int) -> int
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_max_video_tokens(self, seq_len: int) -> int:
    width, height = self.get_max_image_size()
    num_frames = self.get_max_video_frames(seq_len)
    return self._get_video_tokens(num_frames, width, height)

get_mm_max_tokens_per_item

get_mm_max_tokens_per_item(seq_len, mm_counts)
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
    return {
        "image": self.get_max_image_tokens(),
        "video": self.get_max_video_tokens(seq_len),
    }

get_supported_mm_limits

get_supported_mm_limits()
Source code in vllm/model_executor/models/transformers/multimodal.py
def get_supported_mm_limits(self):
    return {"image": None, "video": None}

MultiModalProcessor

Bases: BaseMultiModalProcessor[MultiModalProcessingInfo]

Source code in vllm/model_executor/models/transformers/multimodal.py
class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargsItems,
    ):
        """
        Given the original multi-modal items for this modality
        and HF-processed data, output the updates to perform.

        The information returned by this method is used to update token inputs
        which bypass the HF processor. It is also used to update the output of
        the HF processor if it does not apply prompt updates to text inputs.

        Moreover, this information is critical to determine the token positions
        in order to construct :class:`~vllm.multimodal.inputs.PlaceholderRange`
        for each multi-modal item.
        """
        return None

    def _get_mm_fields_config(
        self,
        hf_inputs: "BatchFeature",
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        # HF Processors always return a mask but vLLM doesn't need it
        hf_inputs.pop("attention_mask", None)
        num_image_patches = hf_inputs.get("num_image_patches")
        mm_fields = {
            key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
            for key in hf_inputs
        }
        mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
            "image", num_image_patches
        )

        # Keep these as batched, as they always have batch size as first dim
        mm_fields["image_grid_thw"] = MultiModalFieldConfig.batched("image")
        mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")

        video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
        video_grid_sizes = video_grid_thw.prod(-1)
        mm_fields["pixel_values_videos"] = MultiModalFieldConfig.flat_from_sizes(
            "video", video_grid_sizes
        )
        mm_fields["video_embeds"] = MultiModalFieldConfig.flat_from_sizes(
            "video", video_grid_sizes
        )
        mm_fields["video_grid_thw"] = MultiModalFieldConfig.batched("video")
        mm_fields["num_video_patches"] = MultiModalFieldConfig.batched("video")
        return mm_fields

    def _get_hf_mm_data(
        self,
        mm_items: MultiModalDataItems,
    ) -> tuple[Mapping[str, object], Mapping[str, object]]:
        """
        In contrast to the base class, this method always adds
        `return_mm_token_type_ids` to the processor data
        """
        processor_data, passthrough_data = super()._get_hf_mm_data(mm_items)
        processor_data["return_mm_token_type_ids"] = True
        return processor_data, passthrough_data

    def apply(
        self,
        prompt: str | list[int],
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
        tokenization_kwargs: Mapping[str, object] | None = None,
        mm_uuids: MultiModalUUIDDict | None = None,
    ) -> MultiModalInputs:
        """
        Process multi-modal inputs to be used in vLLM.

        Apply HF Processor on prompt text and multi-modal data together,
        outputting token IDs and processed tensors.
        """
        if tokenization_kwargs is None:
            tokenization_kwargs = {}

        mm_items = self._to_mm_items(mm_data)
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        if not isinstance(prompt, str):
            # The prompt is already tokenized ids, which the hf_processor does
            # not accept, so we decode the ids back into a string
            prompt = hf_processor.decode(prompt)

        # Bypass cached processor and always apply to the full set of mm inputs
        # NOTE: we can't just set caching=False because the base class method
        # transforms outputs to `MultiModalKwargs`, which will not work for
        # Transformers. We have a lot of logic tied to
        # `mm_tokens_per_modality` below
        prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
            prompt_text=prompt,
            mm_items=mm_items,
            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
            tokenization_kwargs=tokenization_kwargs,
        )

        # For Gemma3 the key is `token_type_ids` instead of `mm_token_type_ids`
        token_type_key = (
            "mm_token_type_ids"
            if "mm_token_type_ids" in processed_data
            else "token_type_ids"
        )
        mm_token_type_ids = processed_data.pop(token_type_key)

        # We can infer vLLM style placeholder from token type ids, if we split
        # it for each input `mm_data`.
        image_sizes = []
        if "image" in mm_items:
            images = mm_items.get_items("image", ImageProcessorItems)
            for item_idx in range(len(images)):
                image_size = images.get_image_size(item_idx)
                image_sizes.append((image_size.height, image_size.width))

        video_sizes = []
        if "video" in mm_items:
            videos = mm_items.get_items("video", VideoProcessorItems)
            for item_idx in range(len(videos)):
                video_size = videos.get_frame_size(item_idx)
                num_frames = videos.get_num_frames(item_idx)
                video_sizes.append((num_frames, video_size.height, video_size.width))

        multimodal_config = self.info.ctx.model_config.multimodal_config
        mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}

        mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
            image_sizes=image_sizes, video_sizes=video_sizes, **mm_processor_kwargs
        )

        mm_placeholders = {}

        # image_token_ids
        mm_positions = torch.where(mm_token_type_ids == 1)[1]
        split_sizes = mm_tokens_per_modality["num_image_tokens"]
        if split_sizes:
            chunked_mm_positions = torch.split(mm_positions, split_sizes)
            mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0] == 1]
            chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
            ranges = [
                PlaceholderRange(
                    offset=positions[0].item(),
                    length=positions.shape[0],
                    is_embed=(mm_tokens == hf_processor.image_token_id).bool(),
                )
                for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens)
            ]
            mm_placeholders["image"] = ranges

        processed_data["num_image_patches"] = torch.tensor(
            mm_tokens_per_modality["num_image_patches"]
        )

        # video_token_ids
        mm_positions = torch.where(mm_token_type_ids == 2)[1]

        split_sizes = mm_tokens_per_modality["num_video_tokens"]
        if split_sizes:
            chunked_mm_positions = torch.split(mm_positions, split_sizes)
            mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0] == 2]
            chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
            ranges = [
                PlaceholderRange(
                    offset=positions[0].item(),
                    length=positions.shape[0],
                    is_embed=(mm_tokens == hf_processor.video_token_id).bool(),
                )
                for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens)
            ]
            mm_placeholders["video"] = ranges

        processed_data["num_video_patches"] = torch.tensor(
            mm_tokens_per_modality["num_video_patches"]
        )

        mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
            processed_data,
            self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
        )

        # Use overrides if provided; fall back to data-dependent hashing.
        mm_hashes = self._hash_mm_items(
            mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
        )

        return MultiModalInputs(
            type="multimodal",
            prompt_token_ids=prompt_ids,
            mm_kwargs=mm_kwargs,
            mm_hashes=mm_hashes,
            mm_placeholders=mm_placeholders,
        )

_get_hf_mm_data

_get_hf_mm_data(
    mm_items: MultiModalDataItems,
) -> tuple[Mapping[str, object], Mapping[str, object]]

In contrast to the base class, this method always adds return_mm_token_type_ids to the processor data

Source code in vllm/model_executor/models/transformers/multimodal.py
def _get_hf_mm_data(
    self,
    mm_items: MultiModalDataItems,
) -> tuple[Mapping[str, object], Mapping[str, object]]:
    """
    In contrast to the base class, this method always adds
    `return_mm_token_type_ids` to the processor data
    """
    processor_data, passthrough_data = super()._get_hf_mm_data(mm_items)
    processor_data["return_mm_token_type_ids"] = True
    return processor_data, passthrough_data

_get_mm_fields_config

_get_mm_fields_config(
    hf_inputs: BatchFeature,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]
Source code in vllm/model_executor/models/transformers/multimodal.py
def _get_mm_fields_config(
    self,
    hf_inputs: "BatchFeature",
    hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
    # HF Processors always return a mask but vLLM doesn't need it
    hf_inputs.pop("attention_mask", None)
    num_image_patches = hf_inputs.get("num_image_patches")
    mm_fields = {
        key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
        for key in hf_inputs
    }
    mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
        "image", num_image_patches
    )

    # Keep these as batched, as they always have batch size as first dim
    mm_fields["image_grid_thw"] = MultiModalFieldConfig.batched("image")
    mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image")

    video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
    video_grid_sizes = video_grid_thw.prod(-1)
    mm_fields["pixel_values_videos"] = MultiModalFieldConfig.flat_from_sizes(
        "video", video_grid_sizes
    )
    mm_fields["video_embeds"] = MultiModalFieldConfig.flat_from_sizes(
        "video", video_grid_sizes
    )
    mm_fields["video_grid_thw"] = MultiModalFieldConfig.batched("video")
    mm_fields["num_video_patches"] = MultiModalFieldConfig.batched("video")
    return mm_fields
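
Conceptually, "flat" fields hold all items concatenated along the first dimension and are recovered with per-item sizes, while "batched" fields keep one leading row per item. A rough illustration with made-up shapes (not the vLLM API itself):

import torch

# Flat: all patches concatenated along dim 0, recovered by per-item sizes.
flat_pixels = torch.randn(3 + 5, 4)           # two images, 3 and 5 patches
num_image_patches = torch.tensor([3, 5])
per_item = torch.split(flat_pixels, num_image_patches.tolist())

# Batched: one leading row per item, no splitting needed.
image_grid_thw = torch.tensor([[1, 16, 16], [1, 32, 24]])
assert len(per_item) == image_grid_thw.shape[0] == 2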

_get_prompt_updates

_get_prompt_updates(
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
)

Given the original multi-modal items for this modality and HF-processed data, output the updates to perform.

The information returned by this method is used to update token inputs which bypass the HF processor. It is also used to update the output of the HF processor if it does not apply prompt updates to text inputs.

Moreover, this information is critical to determine the token positions in order to construct PlaceholderRange for each multi-modal item.

Source code in vllm/model_executor/models/transformers/multimodal.py
def _get_prompt_updates(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    out_mm_kwargs: MultiModalKwargsItems,
):
    """
    Given the original multi-modal items for this modality
    and HF-processed data, output the updates to perform.

    The information returned by this method is used to update token inputs
    which bypass the HF processor. It is also used to update the output of
    the HF processor if it does not apply prompt updates to text inputs.

    Moreover, this information is critical to determine the token positions
    in order to construct :class:`~vllm.multimodal.inputs.PlaceholderRange`
    for each multi-modal item.
    """
    return None

apply

apply(
    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object] | None = None,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs

Process multi-modal inputs to be used in vLLM.

Apply HF Processor on prompt text and multi-modal data together, outputting token IDs and processed tensors.

Source code in vllm/model_executor/models/transformers/multimodal.py
def apply(
    self,
    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object] | None = None,
    mm_uuids: MultiModalUUIDDict | None = None,
) -> MultiModalInputs:
    """
    Process multi-modal inputs to be used in vLLM.

    Apply HF Processor on prompt text and multi-modal data together,
    outputting token IDs and processed tensors.
    """
    if tokenization_kwargs is None:
        tokenization_kwargs = {}

    mm_items = self._to_mm_items(mm_data)
    hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
    if not isinstance(prompt, str):
        # The prompt is already tokenized ids, which the hf_processor does
        # not accept, so we decode the ids back into a string
        prompt = hf_processor.decode(prompt)

    # Bypass cached processor and always apply to the full set of mm inputs
    # NOTE: we can't just set caching=False because the base class method
    # transforms outputs to `MultiModalKwargs`, which will not work for
    # Transformers. We have a lot of logic tied to
    # `mm_tokens_per_modality` below
    prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
        prompt_text=prompt,
        mm_items=mm_items,
        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
        tokenization_kwargs=tokenization_kwargs,
    )

    # For Gemma3 the key is `token_type_ids` instead of `mm_token_type_ids`
    token_type_key = (
        "mm_token_type_ids"
        if "mm_token_type_ids" in processed_data
        else "token_type_ids"
    )
    mm_token_type_ids = processed_data.pop(token_type_key)

    # We can infer vLLM style placeholder from token type ids, if we split
    # it for each input `mm_data`.
    image_sizes = []
    if "image" in mm_items:
        images = mm_items.get_items("image", ImageProcessorItems)
        for item_idx in range(len(images)):
            image_size = images.get_image_size(item_idx)
            image_sizes.append((image_size.height, image_size.width))

    video_sizes = []
    if "video" in mm_items:
        videos = mm_items.get_items("video", VideoProcessorItems)
        for item_idx in range(len(videos)):
            video_size = videos.get_frame_size(item_idx)
            num_frames = videos.get_num_frames(item_idx)
            video_sizes.append((num_frames, video_size.height, video_size.width))

    multimodal_config = self.info.ctx.model_config.multimodal_config
    mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}

    mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
        image_sizes=image_sizes, video_sizes=video_sizes, **mm_processor_kwargs
    )

    mm_placeholders = {}

    # image_token_ids
    mm_positions = torch.where(mm_token_type_ids == 1)[1]
    split_sizes = mm_tokens_per_modality["num_image_tokens"]
    if split_sizes:
        chunked_mm_positions = torch.split(mm_positions, split_sizes)
        mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0] == 1]
        chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
        ranges = [
            PlaceholderRange(
                offset=positions[0].item(),
                length=positions.shape[0],
                is_embed=(mm_tokens == hf_processor.image_token_id).bool(),
            )
            for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens)
        ]
        mm_placeholders["image"] = ranges

    processed_data["num_image_patches"] = torch.tensor(
        mm_tokens_per_modality["num_image_patches"]
    )

    # video_token_ids
    mm_positions = torch.where(mm_token_type_ids == 2)[1]

    split_sizes = mm_tokens_per_modality["num_video_tokens"]
    if split_sizes:
        chunked_mm_positions = torch.split(mm_positions, split_sizes)
        mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0] == 2]
        chunked_mm_tokens = torch.split(mm_tokens, split_sizes)
        ranges = [
            PlaceholderRange(
                offset=positions[0].item(),
                length=positions.shape[0],
                is_embed=(mm_tokens == hf_processor.video_token_id).bool(),
            )
            for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens)
        ]
        mm_placeholders["video"] = ranges

    processed_data["num_video_patches"] = torch.tensor(
        mm_tokens_per_modality["num_video_patches"]
    )

    mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
        processed_data,
        self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs),
    )

    # Use overrides if provided; fall back to data-dependent hashing.
    mm_hashes = self._hash_mm_items(
        mm_items, hf_processor_mm_kwargs, tokenization_kwargs, mm_uuids=mm_uuids
    )

    return MultiModalInputs(
        type="multimodal",
        prompt_token_ids=prompt_ids,
        mm_kwargs=mm_kwargs,
        mm_hashes=mm_hashes,
        mm_placeholders=mm_placeholders,
    )
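
The placeholder construction above can be reproduced on toy data: token type ids mark multimodal positions, which are split by per-item token counts into (offset, length) ranges. A hedged sketch with made-up values:

import torch

# Token type id 1 marks image tokens; two images occupy 3 and 2 positions
# of a 10-token prompt.
mm_token_type_ids = torch.tensor([[0, 1, 1, 1, 0, 0, 1, 1, 0, 0]])
num_image_tokens = [3, 2]

mm_positions = torch.where(mm_token_type_ids == 1)[1]
chunks = torch.split(mm_positions, num_image_tokens)

ranges = [(int(c[0]), c.shape[0]) for c in chunks]  # (offset, length) pairs
assert ranges == [(1, 3), (6, 2)]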