# coding=utf-8
# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XYTokenizer model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class XYTokenizerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`XYTokenizerModel`]. It is used to instantiate
    a XY Tokenizer model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        input_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the input audio.
        output_sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate of the output audio.
        encoder_downsample_rate (`int`, *optional*, defaults to 1280):
            The total downsampling factor of the encoder part.
        decoder_upsample_rate (`int`, *optional*, defaults to 1920):
            The total upsampling factor of the decoder part.
        code_dim (`int`, *optional*, defaults to 1280):
            The dimension of the code embeddings.

    All other parameters from the original YAML/dict config (e.g. `semantic_encoder_d_model`, defaults to 1280, the
    hidden dimension for the semantic encoder; `num_quantizers`, defaults to 32, the number of residual quantizers)
    are accepted via `**kwargs` and stored dynamically. A production-ready config should list every parameter
    explicitly instead.
    """

    model_type = "xy_tokenizer"

    # A comprehensive config would flatten all nested kwargs from the original
    # `generator_params`. This simplified version accepts them dynamically via
    # **kwargs; a real implementation would define each one explicitly.
    def __init__(
        self,
        input_sample_rate=16000,
        output_sample_rate=16000,
        encoder_downsample_rate=1280,
        decoder_upsample_rate=1920,
        code_dim=1280,
        **kwargs,
    ):
        self.input_sample_rate = input_sample_rate
        self.output_sample_rate = output_sample_rate
        self.encoder_downsample_rate = encoder_downsample_rate
        self.decoder_upsample_rate = decoder_upsample_rate
        self.code_dim = code_dim

        # `PretrainedConfig.to_dict()` serializes `self.params` as a top-level
        # key, so on a save/load round-trip `from_dict` feeds a `params` dict
        # back into this __init__ as a kwarg. The previous `self.params = kwargs`
        # re-nested it (params["params"]["params"]...) on every cycle; instead,
        # re-flatten any previously-serialized `params` back into kwargs.
        # Explicit kwargs take precedence over the restored values.
        saved_params = kwargs.pop("params", None)
        if isinstance(saved_params, dict):
            kwargs = {**saved_params, **kwargs}

        # Store all other parameters dynamically (a defensive copy, so later
        # mutation of the caller-visible dict cannot alias into this config).
        # This is a shortcut; a production-ready config should list all
        # parameters explicitly.
        self.params = dict(kwargs)
        super().__init__(**kwargs)


__all__ = ["XYTokenizerConfig"]