"""DiffusionVL (Qwen2.5-VL based) model configuration.""" |
|
|
|
|
|
from typing import List, Optional, Union |
|
|
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
|
|
|
|
|
|
|
class DiffusionVL_Qwen2_5_VL_VisionConfig(PretrainedConfig): |
|
|
r""" |
|
|
This is the configuration class to store the configuration of a [`DiffusionVL_Qwen2_5_VL_VisionModel`]. |
|
|
It is used to instantiate the vision encoder according to the specified arguments. |
|
|
|
|
|
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. |
|
|
Read the documentation from [`PretrainedConfig`] for more information. |
|
|
|
|
|
Args: |
|
|
depth (`int`, *optional*, defaults to 32): |
|
|
Number of vision transformer layers. |
|
|
hidden_size (`int`, *optional*, defaults to 1280): |
|
|
Dimensionality of the encoder layers and the pooler layer. |
|
|
hidden_act (`str`, *optional*, defaults to `"silu"`): |
|
|
The non-linear activation function in the encoder. |
|
|
intermediate_size (`int`, *optional*, defaults to 3420): |
|
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer. |
|
|
num_heads (`int`, *optional*, defaults to 16): |
|
|
Number of attention heads for each attention layer. |
|
|
in_channels (`int`, *optional*, defaults to 3): |
|
|
Number of input channels. |
|
|
patch_size (`int`, *optional*, defaults to 14): |
|
|
The size of each image patch. |
|
|
spatial_merge_size (`int`, *optional*, defaults to 2): |
|
|
The spatial merge size for patch merging. |
|
|
temporal_patch_size (`int`, *optional*, defaults to 2): |
|
|
The temporal patch size for video processing. |
|
|
tokens_per_second (`int`, *optional*, defaults to 4): |
|
|
Number of tokens per second for video processing. |
|
|
window_size (`int`, *optional*, defaults to 112): |
|
|
Window size for windowed attention. |
|
|
out_hidden_size (`int`, *optional*, defaults to 3584): |
|
|
Output hidden size after the vision encoder. |
|
|
fullatt_block_indexes (`List[int]`, *optional*): |
|
|
Indices of blocks that use full attention instead of windowed attention. |
|
|
initializer_range (`float`, *optional*, defaults to 0.02): |
|
|
The standard deviation of the truncated_normal_initializer for initializing weight matrices. |
|
|
|
|
|
Example: |
|
|
|
|
|
```python |
|
|
>>> from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_VisionConfig |
|
|
|
|
|
>>> # Initializing a DiffusionVL vision configuration |
|
|
>>> configuration = DiffusionVL_Qwen2_5_VL_VisionConfig() |
|
|
``` |
|
|
""" |
|
|
|
|
|
model_type = "diffusionvl_qwen2_5_vl_vision" |
|
|
base_config_key = "vision_config" |
|
|
|
|
|
    def __init__(
        self,
        depth: int = 32,
        hidden_size: int = 1280,
        hidden_act: str = "silu",
        intermediate_size: int = 3420,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        tokens_per_second: int = 4,
        window_size: int = 112,
        out_hidden_size: int = 3584,
        fullatt_block_indexes: Optional[List[int]] = None,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.out_hidden_size = out_hidden_size
        # Default matches Qwen2.5-VL: every eighth block uses full attention.
        self.fullatt_block_indexes = fullatt_block_indexes or [7, 15, 23, 31]
        self.initializer_range = initializer_range

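
# A small illustrative helper, not part of the original API: under the Qwen2.5-VL
# patching convention an image is split into (height / patch_size) * (width / patch_size)
# patches, and the patch merger then reduces the count by spatial_merge_size ** 2.
# A sketch assuming exact divisibility; temporal patching for video is ignored.
def _approx_num_vision_tokens(
    config: DiffusionVL_Qwen2_5_VL_VisionConfig, height: int, width: int
) -> int:
    grid_h = height // config.patch_size
    grid_w = width // config.patch_size
    return (grid_h * grid_w) // (config.spatial_merge_size**2)
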
class DiffusionVL_Qwen2_5_VL_Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DiffusionVL_Qwen2_5_VL_ForConditionalGeneration`].
    It is used to instantiate a DiffusionVL model according to the specified arguments.

    DiffusionVL extends the Qwen2.5-VL architecture with BD3LM (Block Diffusion Language Model)
    for diffusion-based text generation instead of autoregressive decoding.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
    Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the DiffusionVL model.
        hidden_size (`int`, *optional*, defaults to 3584):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18944):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 28):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key-value heads for Grouped Query Attention (GQA).
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 128000):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should cache and return past key/value states.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`DiffusionVL_Qwen2_5_VL_VisionConfig` or `dict`, *optional*):
            The configuration for the vision encoder.
        image_token_id (`int`, *optional*, defaults to 151655):
            The token index for the image placeholder.
        video_token_id (`int`, *optional*, defaults to 151656):
            The token index for the video placeholder.
        vision_start_token_id (`int`, *optional*, defaults to 151652):
            The token index denoting the start of vision input.
        vision_end_token_id (`int`, *optional*, defaults to 151653):
            The token index denoting the end of vision input.
        enable_bd3lm (`bool`, *optional*, defaults to `True`):
            Whether to enable BD3LM diffusion-based generation.
        bd3lm_block_size (`int`, *optional*, defaults to 8):
            Block size for BD3LM generation.
        bd3lm_cross_attn (`bool`, *optional*, defaults to `True`):
            Whether to use cross-attention in BD3LM.
        bd3lm_antithetic_sampling (`bool`, *optional*, defaults to `True`):
            Whether to use antithetic sampling when drawing diffusion noise levels during training.
        bd3lm_sampling_eps_min (`float`, *optional*, defaults to 1e-3):
            Lower bound of the sampled diffusion noise level.
        bd3lm_sampling_eps_max (`float`, *optional*, defaults to 1.0):
            Upper bound of the sampled diffusion noise level.
        mask_token_id (`int`, *optional*, defaults to 151671):
            The token index for the mask token used in diffusion.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. When `None`,
            defaults to an M-RoPE configuration with `mrope_section=[16, 24, 24]`.

    Example:

    ```python
    >>> from transformers import AutoModelForCausalLM
    >>> from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config

    >>> # Initializing a DiffusionVL configuration
    >>> configuration = DiffusionVL_Qwen2_5_VL_Config()

    >>> # Initializing a model from the configuration
    >>> model = AutoModelForCausalLM.from_pretrained(
    ...     "path/to/model", config=configuration, trust_remote_code=True
    ... )

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "diffusionvl_qwenvl"
    sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VL_VisionConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        # Text (language model) hyperparameters
        vocab_size: int = 152064,
        hidden_size: int = 3584,
        intermediate_size: int = 18944,
        num_hidden_layers: int = 28,
        num_attention_heads: int = 28,
        num_key_value_heads: int = 4,
        hidden_act: str = "silu",
        max_position_embeddings: int = 128000,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-6,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        attention_dropout: float = 0.0,
        # Vision encoder
        vision_config: Optional[Union[DiffusionVL_Qwen2_5_VL_VisionConfig, dict]] = None,
        # Vision placeholder token ids
        image_token_id: int = 151655,
        video_token_id: int = 151656,
        vision_start_token_id: int = 151652,
        vision_end_token_id: int = 151653,
        # BD3LM (block diffusion) settings
        enable_bd3lm: bool = True,
        bd3lm_block_size: int = 8,
        bd3lm_cross_attn: bool = True,
        bd3lm_antithetic_sampling: bool = True,
        bd3lm_sampling_eps_min: float = 1e-3,
        bd3lm_sampling_eps_max: float = 1.0,
        mask_token_id: int = 151671,
        # RoPE
        rope_theta: float = 1000000.0,
        rope_scaling: Optional[dict] = None,
        **kwargs,
    ):
        # This class keeps the text-model hyperparameters at the top level, so a nested
        # `text_config` (as found in some Qwen2.5-VL checkpoints) is discarded if present.
        kwargs.pop("text_config", None)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_dropout = attention_dropout
        self.rope_theta = rope_theta
        # Default is multimodal RoPE (M-RoPE) as in Qwen2.5-VL; the sections split the
        # rotary half-dim (3584 / 28 / 2 = 64 = 16 + 24 + 24) across time/height/width.
        # Both "rope_type" and "type" are set for compatibility across transformers versions.
        self.rope_scaling = rope_scaling or {
            "mrope_section": [16, 24, 24],
            "rope_type": "default",
            "type": "default",
        }

        if vision_config is None:
            self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig()
        elif isinstance(vision_config, dict):
            self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig(**vision_config)
        elif isinstance(vision_config, DiffusionVL_Qwen2_5_VL_VisionConfig):
            self.vision_config = vision_config
        else:
            raise TypeError(
                "vision_config must be None, a dict, or a DiffusionVL_Qwen2_5_VL_VisionConfig, "
                f"got {type(vision_config)}"
            )

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id

        self.enable_bd3lm = enable_bd3lm
        self.bd3lm_block_size = bd3lm_block_size
        self.bd3lm_cross_attn = bd3lm_cross_attn
        self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
        self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
        self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
        self.mask_token_id = mask_token_id

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
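
# Illustration only, not part of the original API: BD3LM generates text in fixed-size
# blocks; this sketch shows how a target sequence length maps onto a block count.
def _num_bd3lm_blocks(config: DiffusionVL_Qwen2_5_VL_Config, seq_len: int) -> int:
    """Number of diffusion blocks needed to cover `seq_len` tokens (ceiling division)."""
    return -(-seq_len // config.bd3lm_block_size)
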
__all__ = ["DiffusionVL_Qwen2_5_VL_Config", "DiffusionVL_Qwen2_5_VL_VisionConfig"]
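
# A minimal round-trip smoke test; a sketch, assuming this file is used as a standalone
# `trust_remote_code` module. It exercises only config (de)serialization, which is the
# same `to_dict`/`from_dict` path `from_pretrained` uses, and makes no claims about the model.
if __name__ == "__main__":
    config = DiffusionVL_Qwen2_5_VL_Config()
    restored = DiffusionVL_Qwen2_5_VL_Config.from_dict(config.to_dict())
    assert restored.vision_config.depth == config.vision_config.depth
    assert restored.bd3lm_block_size == config.bd3lm_block_size
    print(f"model_type={restored.model_type}, vocab_size={restored.vocab_size}")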