{
  "type": "smolvla",
  "n_obs_steps": 1,
  "input_features": {
    "observation.state": {
      "type": "STATE",
      "shape": [6]
    },
    "observation.images.camera1": {
      "type": "VISUAL",
      "shape": [3, 256, 256]
    },
    "observation.images.camera2": {
      "type": "VISUAL",
      "shape": [3, 256, 256]
    },
    "observation.images.camera3": {
      "type": "VISUAL",
      "shape": [3, 256, 256]
    }
  },
  "device": "cuda",
  "use_amp": false,
  "push_to_hub": true,
  "repo_id": null,
  "private": null,
  "tags": null,
  "license": null,
  "chunk_size": 50,
  "n_action_steps": 50,
  "normalization_mapping": {
    "VISUAL": "IDENTITY",
    "STATE": "MEAN_STD",
    "ACTION": "MEAN_STD"
  },
  "max_state_dim": 32,
  "max_action_dim": 32,
  "resize_imgs_with_padding": [512, 512],
  "scheduler_decay_lr": 2.5e-06,
  "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
  "load_vlm_weights": true,
  "add_image_special_tokens": false,
  "attention_mode": "cross_attn",
  "prefix_length": 0,
  "pad_language_to": "max_length",
  "num_expert_layers": 0,
  "num_vlm_layers": 16,
  "self_attn_every_n_layers": 2,
  "expert_width_multiplier": 0.75,
  "min_period": 0.004,
  "max_period": 4.0
}