model_name: molmo
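# LLM backbone settings (Qwen2.5-7B-scale decoder: d_model 3584, 28 layers, 28 query heads / 4 KV heads).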
llm:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  head_dim: null
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  rope_type: default
  rope_factor: null
  rope_high_freq_factor: null
  rope_low_freq_factor: null
  rope_original_max_position_embeddings: null
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  attention_layer_norm_type: olmo
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  norm_after: false
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  embedding_dropout: 0.0
  scale_logits: false
  vocab_size: 152064
  additional_vocab_size: 128
  weight_tying: false
  embedding_size: 152064
  use_position_ids: true
  tokenizer:
    identifier: Qwen/Qwen2.5-7B
    tokenizer_dir: null
    depth_tokens: true
  init_path: gs://mm-olmo/pretrained_llms/qwen2.5-7b.pt
  init_incremental: null
  new_embedding_init_range: 0.02
  initializer_range: 0.02
  normalize_input_embeds: false
  activation_checkpoint: whole_layer
  compile: blocks
  fix_pad_tokenizer: false
  resize_vocab: false
  init_std: 0.02
  init_fn: normal
  init_cutoff_factor: null
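# Vision backbone: SigLIP2 SO400M ViT (378x378 input, 14x14 patches, 27 layers) plus the connector (2D attention pooling and an MLP projector).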
vision_backbone:
  vit:
    image_model_type: siglip
    image_default_input_size:
    - 378
    - 378
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1152
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 27
    image_head_dim: 72
    image_mlp_dim: 4304
    image_mlp_activations: gelu_pytorch_tanh
    image_dropout_rate: 0.0
    image_num_pos: 729
    image_norm_eps: 1.0e-06
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    float32_attention: true
    attention_type: sdpa
    activation_checkpointing: true
    init_path: gs://mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
    resize_mode: siglip
    pad_value: 0.0
    normalize: siglip
  image_pooling_2d: attention_meanq
  pooling_attention_mask: false
  image_projector: mlp
  image_padding_embed: null
  vit_layers:
  - -3
  - -9
  skip_unused_layers: true
  image_feature_dropout: 0.0
  connector_activation_checkpointing: true
  compile_vit: blocks
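# Prompt and message formatting options.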
data_formatter:
  prompt_templates: uber_model
  message_format: role
  system_prompt: demo_or_style
  always_start_with_space: false
  default_inference_len: 65
  select_answer: best
  debug: false
  image_last: false
  format_message_list: null
  p_one_message: 0.0
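# Multimodal preprocessing: overlapping image crops (up to 8 per image, 2 images max), 2x2 token pooling, and image augmentation.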
mm_preprocessor:
  crop_mode: overlap-and-resize-c2
  max_crops: 8
  max_images: 2
  max_multi_image_crops: 8
  pooling_w: 2
  pooling_h: 2
  overlap_margins:
  - 4
  - 4
  use_col_tokens: true
  loss_token_weighting: root_subsegments
  legacy_image_mask: false
  max_answer_len: null
  img_aug: true
  bi_directional_attn: null
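# LoRA fine-tuning settings and action discretization (256 action bins).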
lora_enable: true
lora_rank: 32
lora_alpha: 16
lora_dropout: 0.0
lora_bias: none
n_action_bins: 256
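# Per-dimension action and proprioception statistics (mean/std/min/max and 1st/99th percentiles) for the LIBERO-Object dataset,
# computed over 454 trajectories / 66984 transitions; presumably used to normalize actions for training and to de-normalize
# predicted actions at inference (assumption based on the q01/q99 bounds and n_action_bins above).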
norm_stats:
  libero_object_no_noops_modified:
    action:
      mean:
      - 0.07096529006958008
      - 0.13498851656913757
      - -0.04601382836699486
      - 0.00123520044144243
      - 0.006998839322477579
      - -0.015027612447738647
      - 0.46428999304771423
      std:
      - 0.2681235373020172
      - 0.43846824765205383
      - 0.4474974274635315
      - 0.024446550756692886
      - 0.049355510622262955
      - 0.042107198387384415
      - 0.49879148602485657
      max:
      - 0.9375
      - 0.8919642567634583
      - 0.9375
      - 0.17678570747375488
      - 0.35035714507102966
      - 0.1810714304447174
      - 1.0
      min:
      - -0.8839285969734192
      - -0.9375
      - -0.9375
      - -0.15000000596046448
      - -0.29035714268684387
      - -0.32892856001853943
      - 0.0
      q01:
      - -0.5383928418159485
      - -0.8758928775787354
      - -0.9375
      - -0.06964285671710968
      - -0.11678571254014969
      - -0.15964286029338837
      - 0.0
      q99:
      - 0.8464285731315613
      - 0.84375
      - 0.9375
      - 0.08142857253551483
      - 0.14892856776714325
      - 0.0867857113480568
      - 1.0
    proprio:
      mean:
      - -0.02999030612409115
      - -0.007947085425257683
      - 0.20293472707271576
      - 3.1086409091949463
      - -0.21404768526554108
      - -0.11307074874639511
      - 0.0
      - 0.029380427673459053
      - -0.030556727200746536
      std:
      - 0.06694897264242172
      - 0.17608462274074554
      - 0.07807064801454544
      - 0.0868484303355217
      - 0.33540457487106323
      - 0.20728276669979095
      - 0.0
      - 0.00956575945019722
      - 0.009197483770549297
      max:
      - 0.14580604434013367
      - 0.33216384053230286
      - 0.3857804834842682
      - 3.4003844261169434
      - 0.7954911589622498
      - 0.6642207503318787
      - 0.0
      - 0.04104341194033623
      - -0.00018117300351150334
      min:
      - -0.1765444278717041
      - -0.29457300901412964
      - 0.008128180168569088
      - 2.2890501022338867
      - -1.883241891860962
      - -1.0600427389144897
      - 0.0
      - 0.0006495157140307128
      - -0.041782498359680176
      q01:
      - -0.14911890715360643
      - -0.25978428691625594
      - 0.009925739830359817
      - 2.7545341420173646
      - -1.3996034812927245
      - -0.6867720144987106
      - 0.0
      - 0.008197814421728254
      - -0.04015838988125324
      q99:
      - 0.09063626825809479
      - 0.29066365867853167
      - 0.3370887073874472
      - 3.2611824750900267
      - 0.32092821151018125
      - 0.4037663781642913
      - 0.0
      - 0.039891827926039694
      - -0.009106044843792932
    num_transitions: 66984
    num_trajectories: 454