Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +57 -0
__pycache__/__init__.cpython-310.pyc +0 -0
__pycache__/modeling_desta.cpython-310.pyc +0 -0
modeling_desta.py +6 -6

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+## DeSTA2
+[📑 Paper](https://arxiv.org/pdf/2409.20007) | [🌐 Website](https://kehanlu.github.io/DeSTA2/) | [👩‍💻 Github](https://github.com/kehanlu/DeSTA2) | [🤗 Model](https://huggingface.co/DeSTA-ntu/DeSTA2-8B-beta) | [🤗 Dataset](https://huggingface.co/datasets/DeSTA-ntu/DeSTA2-Llama3-8B-Instruct) |
+## Quickstart
+```python
+from transformers import AutoModel
+HF_TOKEN = "hf_..." # your huggingface token for downloading Llama3 from official Meta repo
+model = AutoModel.from_pretrained("DeSTA-ntu/DeSTA2-8B-beta", trust_remote_code=True, token=HF_TOKEN)
+messages = [
+            {"role": "system", "content": "You are a helpful voice assistant."},
+            {"role": "audio", "content": "<path_to_audio_file>"},
+            {"role": "user", "content": "Describe the audio."}
+        ]
+generated_ids = model.chat(
+    messages,
+    max_new_tokens=128,
+    do_sample=True,
+    temperature=0.6,
+    top_p=0.9
+)
+response = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(response)
+```
+## Citation
+If you find our work useful, please consider citing the paper:
+```
+@article{lu2024developing,
+  title={Developing Instruction-Following Speech Language Model Without Speech Instruction-Tuning Data},
+  author={Lu, Ke-Han and Chen, Zhehuai and Fu, Szu-Wei and Yang, Chao-Han Huck and Balam, Jagadeesh and Ginsburg, Boris and Wang, Yu-Chiang Frank and Lee, Hung-yi},
+  journal={arXiv preprint arXiv:2409.20007},
+  year={2024}
+}
+@inproceedings{lu24c_interspeech,
+  title     = {DeSTA: Enhancing Speech Language Models through Descriptive Speech-Text Alignment},
+  author    = {Ke-Han Lu and Zhehuai Chen and Szu-Wei Fu and He Huang and Boris Ginsburg and Yu-Chiang Frank Wang and Hung-yi Lee},
+  year      = {2024},
+  booktitle = {Interspeech 2024},
+  pages     = {4159--4163},
+  doi       = {10.21437/Interspeech.2024-457},
+  issn      = {2958-1796},
+}
+```

__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (153 Bytes). View file

__pycache__/modeling_desta.cpython-310.pyc ADDED Viewed

Binary file (7.53 kB). View file

modeling_desta.py CHANGED Viewed

@@ -98,7 +98,7 @@ class SpeechPerception(PreTrainedModel):
     def generate(self, input_features):
         input_features = input_features.to(self.whisper.device)
-        outputs = self.whisper.generate(inputs=input_features, return_dict_in_generate=True, output_hidden_states=True) # here we use default generate config for whisper
         transcriptions = self.processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
         speech_features = self.connector(outputs.encoder_hidden_states)
@@ -109,12 +109,12 @@ class SpeechPerception(PreTrainedModel):
 class DestaModel(PreTrainedModel):
     config_class = Desta2Config
-    def __init__(self, config):
         super().__init__(config)
         self.speech_perception = SpeechPerception(config)
-        self.llama = AutoModelForCausalLM.from_pretrained(config.llama_model_id, torch_dtype=torch.bfloat16)
-        self.tokenizer = AutoTokenizer.from_pretrained(config.llama_model_id)
     def chat(self, messages, max_new_tokens=128, do_sample=True, temperature=0.6, top_p=0.9):
@@ -197,9 +197,9 @@ class DestaModel(PreTrainedModel):
         return audio_path, input_features
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, config=None, cache_dir=None,**kwargs):
         config = cls.config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        model = cls(config)
         if os.path.isdir(pretrained_model_name_or_path):
             model.speech_perception.connector.load_state_dict(

     def generate(self, input_features):
         input_features = input_features.to(self.whisper.device)
+        outputs = self.whisper.generate(input_features=input_features, return_dict_in_generate=True, output_hidden_states=True) # here we use default generate config for whisper
         transcriptions = self.processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0]
         speech_features = self.connector(outputs.encoder_hidden_states)
 class DestaModel(PreTrainedModel):
     config_class = Desta2Config
+    def __init__(self, config, **kwargs):
         super().__init__(config)
         self.speech_perception = SpeechPerception(config)
+        self.llama = AutoModelForCausalLM.from_pretrained(config.llama_model_id, torch_dtype=torch.bfloat16, **kwargs)
+        self.tokenizer = AutoTokenizer.from_pretrained(config.llama_model_id, **kwargs)
     def chat(self, messages, max_new_tokens=128, do_sample=True, temperature=0.6, top_p=0.9):
         return audio_path, input_features
     @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, config=None,**kwargs):
         config = cls.config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        model = cls(config, **kwargs)
         if os.path.isdir(pretrained_model_name_or_path):
             model.speech_perception.connector.load_state_dict(