{ "model_type": "xy_tokenizer", "auto_map": { "AutoFeatureExtractor": "feature_extraction_xy_tokenizer.XYTokenizerFeatureExtractor", "AutoConfig": "configuration_xy_tokenizer.XYTokenizerConfig", "AutoModel": "modeling_xy_tokenizer.XYTokenizerModel" }, "input_sample_rate": 16000, "output_sample_rate": 24000, "encoder_downsample_rate": 1280, "decoder_upsample_rate": 1920, "code_dim": 3072, "params": { "feature_extractor_kwargs": { "chunk_length": 30, "feature_size": 80, "hop_length": 160, "n_fft": 400, "n_samples": 480000, "nb_max_frames": 3000, "padding_side": "right", "padding_value": 0.0, "sampling_rate": 16000, "return_attention_mask": true, "return_tensors": "pt" }, "semantic_encoder_kwargs": { "num_mel_bins": 80, "sampling_rate": 16000, "hop_length": 160, "stride_size": 2, "kernel_size": 3, "d_model": 768, "scale_embedding": false, "max_audio_seconds": 30, "encoder_layers": 12, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "activation_function": "gelu" }, "semantic_encoder_adapter_kwargs": { "input_dim": 768, "output_dim": 768, "d_model": 768, "max_source_positions": 1500, "encoder_layers": 4, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072 }, "acoustic_encoder_kwargs": { "num_mel_bins": 80, "sampling_rate": 16000, "hop_length": 160, "stride_size": 2, "kernel_size": 3, "d_model": 768, "scale_embedding": false, "max_audio_seconds": 30, "encoder_layers": 12, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "activation_function": "gelu" }, "pre_rvq_adapter_kwargs": { "input_dim": 1536, "output_dim": 768, "d_model": 768, "max_source_positions": 1500, "encoder_layers": 4, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072 }, "downsample_kwargs": { "d_model": 768, "avg_pooler": 4 }, "quantizer_kwargs": { "input_dim": 3072, "rvq_dim": 512, "output_dim": 3072, "num_quantizers": 8, "codebook_size": 1024, "codebook_dim": 512, "quantizer_dropout": 0.0 }, "post_rvq_adapter_kwargs": { "input_dim": 3072, "output_dim": 3072, "d_model": 768, "max_source_positions": 375, "encoder_layers": 4, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072 }, "upsample_kwargs": { "d_model": 768, "stride": 4 }, "acoustic_decoder_kwargs": { "num_mel_bins": 80, "sampling_rate": 16000, "hop_length": 160, "stride_size": 2, "kernel_size": 3, "d_model": 768, "scale_embedding": false, "max_audio_seconds": 30, "decoder_layers": 12, "decoder_attention_heads": 12, "decoder_ffn_dim": 3072, "activation_function": "gelu" }, "vocos_kwargs": { "input_channels": 80, "dim": 512, "intermediate_dim": 4096, "num_layers": 30, "n_fft": 960, "hop_size": 240, "padding": "same" } }, "torch_dtype": "float32", "transformers_version": "4.51.0" }