Cqy2019's picture
automodel_remote_code_support (#2)
c884072 verified
{
  "model_type": "xy_tokenizer",
  "auto_map": {
    "AutoFeatureExtractor": "feature_extraction_xy_tokenizer.XYTokenizerFeatureExtractor",
    "AutoConfig": "configuration_xy_tokenizer.XYTokenizerConfig",
    "AutoModel": "modeling_xy_tokenizer.XYTokenizerModel"
  },
  "input_sample_rate": 16000,
  "output_sample_rate": 24000,
  "encoder_downsample_rate": 1280,
  "decoder_upsample_rate": 1920,
  "code_dim": 3072,
  "params": {
    "feature_extractor_kwargs": {
      "chunk_length": 30,
      "feature_size": 80,
      "hop_length": 160,
      "n_fft": 400,
      "n_samples": 480000,
      "nb_max_frames": 3000,
      "padding_side": "right",
      "padding_value": 0.0,
      "sampling_rate": 16000,
      "return_attention_mask": true,
      "return_tensors": "pt"
    },
    "semantic_encoder_kwargs": {
      "num_mel_bins": 80,
      "sampling_rate": 16000,
      "hop_length": 160,
      "stride_size": 2,
      "kernel_size": 3,
      "d_model": 768,
      "scale_embedding": false,
      "max_audio_seconds": 30,
      "encoder_layers": 12,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072,
      "activation_function": "gelu"
    },
    "semantic_encoder_adapter_kwargs": {
      "input_dim": 768,
      "output_dim": 768,
      "d_model": 768,
      "max_source_positions": 1500,
      "encoder_layers": 4,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072
    },
    "acoustic_encoder_kwargs": {
      "num_mel_bins": 80,
      "sampling_rate": 16000,
      "hop_length": 160,
      "stride_size": 2,
      "kernel_size": 3,
      "d_model": 768,
      "scale_embedding": false,
      "max_audio_seconds": 30,
      "encoder_layers": 12,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072,
      "activation_function": "gelu"
    },
    "pre_rvq_adapter_kwargs": {
      "input_dim": 1536,
      "output_dim": 768,
      "d_model": 768,
      "max_source_positions": 1500,
      "encoder_layers": 4,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072
    },
    "downsample_kwargs": {
      "d_model": 768,
      "avg_pooler": 4
    },
    "quantizer_kwargs": {
      "input_dim": 3072,
      "rvq_dim": 512,
      "output_dim": 3072,
      "num_quantizers": 8,
      "codebook_size": 1024,
      "codebook_dim": 512,
      "quantizer_dropout": 0.0
    },
    "post_rvq_adapter_kwargs": {
      "input_dim": 3072,
      "output_dim": 3072,
      "d_model": 768,
      "max_source_positions": 375,
      "encoder_layers": 4,
      "encoder_attention_heads": 12,
      "encoder_ffn_dim": 3072
    },
    "upsample_kwargs": {
      "d_model": 768,
      "stride": 4
    },
    "acoustic_decoder_kwargs": {
      "num_mel_bins": 80,
      "sampling_rate": 16000,
      "hop_length": 160,
      "stride_size": 2,
      "kernel_size": 3,
      "d_model": 768,
      "scale_embedding": false,
      "max_audio_seconds": 30,
      "decoder_layers": 12,
      "decoder_attention_heads": 12,
      "decoder_ffn_dim": 3072,
      "activation_function": "gelu"
    },
    "vocos_kwargs": {
      "input_channels": 80,
      "dim": 512,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 960,
      "hop_size": 240,
      "padding": "same"
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.51.0"
}