The model uses the following token sequence order for multimodal inputs:
[Image tokens] [Point Cloud tokens] [Instruction + Description] [CAD Commands]

| Segment | Token count |
|---|---|
| Image tokens | 256 |
| Point cloud tokens | 1 + 256 |
| Instruction + description | variable |
| CAD commands | variable |
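For concreteness, here is a minimal sketch of how these segments could be concatenated into one input sequence along the sequence dimension. The tensor names and the hidden size are illustrative assumptions, not identifiers from the CAD-MLLM codebase:

```python
import torch

# Hypothetical per-segment embeddings of shape (batch, seq_len, hidden).
# A hidden size of 4096 is assumed for a Qwen3-8B-class backbone.
img_emb  = torch.randn(1, 256, 4096)      # 256 image tokens
pc_emb   = torch.randn(1, 1 + 256, 4096)  # 1 + 256 point cloud tokens
text_emb = torch.randn(1, 120, 4096)      # instruction + description (variable length)
cad_emb  = torch.randn(1, 900, 4096)      # CAD command tokens (variable length)

# Concatenate in the order the model expects:
# image -> point cloud -> text -> CAD commands.
inputs_embeds = torch.cat([img_emb, pc_emb, text_emb, cad_emb], dim=1)
assert inputs_embeds.shape == (1, 256 + 257 + 120 + 900, 4096)
```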
The Stage 3 command below resumes training from a saved checkpoint and runs only the third curriculum stage (Stage 1 and Stage 2 epochs are set to 0). With `--batch_size 1` and `--gradient_accumulation_steps 16`, the effective batch size is 16.

```bash
# Stage 3
python scripts/train_curriculum.py \
--attn_implementation flash_attention_2 \
--use_gradient_checkpointing \
--use_wandb \
--output_dir ./CAD-MLLM-checkpoints \
--omnicad_json_root ./data/Omni-CAD-subset-complete/json \
--omnicad_txt_path ./data/Omni-CAD-subset-complete/txt \
--omnicad_img_root ./data/Omni-CAD-subset-complete/img \
--omnicad_pc_root ./data/Omni-CAD-subset-complete/pointcloud \
--llm_model_name "Qwen/Qwen3-8B" \
--resume_from_ckpt "./CAD-MLLM-checkpoints/stage3_all/checkpoint-epoch0-step100-20251129_213538" \
--start_from_stage 3 \
--stage1_epochs 0 \
--stage2_epochs 0 \
--stage3_epochs 10 \
--stage3_lr 2e-4 \
--max_seq_length 8192 \
--batch_size 1 \
--gradient_accumulation_steps 16 \
--lora_r 32 \
--lora_alpha 64 \
--warmup_steps 50 \
--logging_steps 1000 \
--save_steps 100 \
--device cuda \
--dtype bfloat16 \
--wandb_project "CAD-MLLM-stage_3" \
--wandb_run_name "latent_pc" \
--wandb_entity "yizhuod-cmu" \
--seed 30
```
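The `--lora_r 32 --lora_alpha 64` flags configure low-rank adapters on the language model. As a rough guide to what they translate to, here is a minimal sketch using Hugging Face PEFT; the `target_modules` list and the dropout value are assumptions, since the actual choices live inside `scripts/train_curriculum.py`:

```python
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load the backbone named by --llm_model_name in bfloat16 (matches --dtype).
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B", torch_dtype=torch.bfloat16
)

# Mirrors --lora_r 32 and --lora_alpha 64. target_modules and lora_dropout
# are assumed values; the training script may wrap a different set of layers.
lora_cfg = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()  # only the adapter weights are trainable
```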