# Xmodel-2.5 / configuration_xmodel2.py
# Copyright (c) 2023 XiaoDuo AI. All rights reserved.
from typing import Optional, Dict
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
from typing_extensions import Self
logger = logging.get_logger(__name__)
class XmodelConfig(PretrainedConfig):
"""Configuration class for Xmodel models.
Args:
vocab_size (int): Vocabulary size of the model
hidden_size (int): Dimensionality of the embeddings and hidden states
        intermediate_size (Optional[int]): Dimensionality of the "intermediate" (feed-forward) layer;
            if None, it is computed as the smallest multiple of 256 that is >= 8 * hidden_size / 3
        num_hidden_layers (int): Number of hidden layers in the Transformer decoder
num_attention_heads (int): Number of attention heads for each attention layer
num_key_value_heads (int): Number of key/value heads (for grouped query attention)
hidden_act (str): Activation function for hidden layers
max_position_embeddings (int): Maximum sequence length
initializer_range (float): Standard deviation for weight initialization
rms_norm_eps (float): Epsilon for RMS layer normalization
use_cache (bool): Whether to use caching for faster generation
rope_theta (float): Base frequency for rotary embeddings
rope_scaling (Optional[Dict]): Scaling configuration for rotary embeddings
attention_bias (bool): Whether to use bias in attention layers
attention_dropout (float): Dropout probability for attention layers
mlp_bias (bool): Whether to use bias in MLP layers
        use_mup (bool): Whether to enable muP (maximal update parametrization) scaling
        mup_input_scale (float): muP scale applied on the input (embedding) side
        mup_output_scale (float): muP scale applied on the output side
        mup_attention_residual_scale (float): muP scale for the attention residual branch
        mup_ffn_residual_scale (float): muP scale for the feed-forward residual branch
        mup_base_width (int): Base hidden size used to derive the muP width multiplier
    """
model_type = "xmodel"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size: int = 32000,
hidden_size: int = 1536,
intermediate_size: Optional[int] = 4096,
num_hidden_layers: int = 48,
num_attention_heads: int = 24,
num_key_value_heads: Optional[int] = 8,
hidden_act: str = "silu",
max_position_embeddings: int = 4096,
initializer_range: float = 0.1,
rms_norm_eps: float = 1e-5,
use_cache: bool = True,
pad_token_id: Optional[int] = None,
bos_token_id: int = 1,
eos_token_id: int = 2,
pretraining_tp: int = 1,
tie_word_embeddings: bool = True,
        rope_theta: float = 10000.0,
        rope_scaling: Optional[Dict] = None,
attention_bias: bool = False,
attention_dropout: float = 0.0,
mlp_bias: bool = False,
use_mup: bool = False,
mup_input_scale: float = 12.0,
mup_output_scale: float = 1.0,
mup_attention_residual_scale: float = 1.4,
mup_ffn_residual_scale: float = 1.4,
mup_base_width: int = 256,
**kwargs,
):
# Validate input parameters
if vocab_size <= 0:
raise ValueError(f"vocab_size must be positive, got {vocab_size}")
if hidden_size <= 0:
raise ValueError(f"hidden_size must be positive, got {hidden_size}")
if num_hidden_layers <= 0:
raise ValueError(f"num_hidden_layers must be positive, got {num_hidden_layers}")
if num_attention_heads <= 0:
raise ValueError(f"num_attention_heads must be positive, got {num_attention_heads}")
if max_position_embeddings <= 0:
raise ValueError(f"max_position_embeddings must be positive, got {max_position_embeddings}")
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
# Calculate intermediate size if not provided
if intermediate_size is None:
self.intermediate_size = find_multiple(int(8 * hidden_size / 3), 256)
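            # Worked example: the default hidden_size=1536 gives int(8 * 1536 / 3) = 4096,
            # already a multiple of 256, so intermediate_size becomes 4096.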
else:
if intermediate_size <= 0:
raise ValueError(f"intermediate_size must be positive, got {intermediate_size}")
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# Handle grouped query attention
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
elif num_key_value_heads > num_attention_heads:
raise ValueError(
f"num_key_value_heads ({num_key_value_heads}) must be <= num_attention_heads ({num_attention_heads})"
)
self.num_key_value_heads = num_key_value_heads
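        # With the defaults (24 attention heads, 8 key/value heads), grouped query
        # attention shares each key/value head across 3 query heads.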
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.use_mup = use_mup
self.mup_input_scale = mup_input_scale
self.mup_output_scale = mup_output_scale
self.mup_attention_residual_scale = mup_attention_residual_scale
self.mup_ffn_residual_scale = mup_ffn_residual_scale
self.mup_base_width = mup_base_width
self.mup_width_multiplier = hidden_size / mup_base_width
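        # Width multiplier relative to the muP base width, e.g. 1536 / 256 = 6.0
        # with the defaults above.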
self._attn_implementation = "eager"
self.auto_map = {
"AutoConfig": "configuration_xmodel2.XmodelConfig",
"AutoModelForCausalLM": "modeling_xmodel2.XmodelForCausalLM"
}
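        # auto_map lets the Auto classes (AutoConfig / AutoModelForCausalLM) resolve
        # these custom implementations when loading with trust_remote_code=True.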
        # Validate the rotary position embedding (RoPE) parameters.
        # BC: if there is a legacy 'type' field, copy it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
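        # Example of one layout rope_config_validation accepts:
        # rope_scaling={"rope_type": "linear", "factor": 2.0}
        # (see transformers.modeling_rope_utils for the full set of schemes).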
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
@classmethod
def from_name(cls, name: str) -> Self:
"""Create a configuration from a preset model name.
Args:
name (str): Name of the model preset (e.g. 'nano', 'small', 'xl')
Returns:
XmodelConfig: Configuration instance
Raises:
ValueError: If the model name is not found in presets
"""
if name not in xmodel_configs:
raise ValueError(
f"Unknown model name '{name}'. Available models: {list(xmodel_configs.keys())}"
)
return cls(**xmodel_configs[name])
xmodel_configs = {
"nano": dict(num_hidden_layers=8,
num_attention_heads=4,
num_key_value_heads=1,
hidden_size=256,
tie_word_embeddings=True,
intermediate_size=640),
"micro": dict(num_hidden_layers=12,
num_attention_heads=6,
num_key_value_heads=1,
hidden_size=384,
tie_word_embeddings=True,
intermediate_size=960),
"tiny": dict(num_hidden_layers=18,
num_attention_heads=8,
num_key_value_heads=4,
hidden_size=512,
tie_word_embeddings=True,
intermediate_size=1280),
# GPT-1 & Bert-Base
"small": dict(num_hidden_layers=30,
num_attention_heads=9,
num_key_value_heads=3,
hidden_size=576,
tie_word_embeddings=True,
intermediate_size=1440),
# Bert-Large
"medium": dict(num_hidden_layers=32,
num_attention_heads=15,
num_key_value_heads=5,
hidden_size=960,
tie_word_embeddings=True,
intermediate_size=2400),
# GPT-2
"xl": dict(num_hidden_layers=48,
num_attention_heads=24,
num_key_value_heads=8,
hidden_size=1536,
tie_word_embeddings=True,
               intermediate_size=3840),
"2B": dict(num_hidden_layers=40,
num_attention_heads=36,
num_key_value_heads=6,
hidden_size=2304,
tie_word_embeddings=True,
intermediate_size=5760),
"4B": dict(num_hidden_layers=62,
num_attention_heads=40,
num_key_value_heads=8,
hidden_size=2560,
tie_word_embeddings=True,
intermediate_size=6400),
}
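# Every preset keeps hidden_size / num_attention_heads == 64, so the per-head
# dimension stays constant across model sizes.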
def find_multiple(n: int, k: int) -> int:
"""Find the smallest multiple of k that is >= n.
Args:
n (int): Target number
k (int): Multiple base
Returns:
int: Smallest multiple of k >= n
Raises:
ValueError: If k <= 0
"""
if k <= 0:
raise ValueError(f"k must be positive, got {k}")
if n % k == 0:
return n
return n + k - (n % k)
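

# Minimal usage sketch: build a preset configuration and check the derived fields.
if __name__ == "__main__":
    cfg = XmodelConfig.from_name("xl")
    print(cfg.hidden_size, cfg.intermediate_size, cfg.num_key_value_heads)  # 1536 3840 8
    print(cfg.mup_width_multiplier)  # 1536 / 256 = 6.0
    # When intermediate_size is left unset it is derived via find_multiple:
    assert find_multiple(int(8 * 768 / 3), 256) == 2048
    assert XmodelConfig(hidden_size=768, intermediate_size=None).intermediate_size == 2048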