Update streamlit_app.py
streamlit_app.py  CHANGED  (+125 -53)
@@ -339,19 +339,59 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
                 tmp_file.write(audio_data)
                 tmp_file_path = tmp_file.name
 
-            # Load audio using librosa
+            # Load audio using multiple fallback methods
+            audio_array = None
+            sampling_rate = 16000
+
+            # Method 1: Try librosa
             try:
                 audio_array, sampling_rate = librosa.load(tmp_file_path, sr=16000, dtype=np.float32)
+                st.info("✅ Audio loaded with librosa")
             except Exception as e:
+                st.warning(f"Librosa failed: {e}")
+
+                # Method 2: Try soundfile
+                try:
+                    audio_array, sampling_rate = sf.read(tmp_file_path)
+                    if sampling_rate != 16000:
+                        audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=16000)
+                        sampling_rate = 16000
+                    # Ensure float32 dtype
+                    audio_array = audio_array.astype(np.float32)
+                    st.info("✅ Audio loaded with soundfile")
+                except Exception as e2:
+                    st.warning(f"Soundfile failed: {e2}")
+
+                    # Method 3: Try scipy.io.wavfile
+                    try:
+                        sampling_rate, audio_array = wavfile.read(tmp_file_path)
+                        # Convert to float32 and normalize
+                        if audio_array.dtype == np.int16:
+                            audio_array = audio_array.astype(np.float32) / 32768.0
+                        elif audio_array.dtype == np.int32:
+                            audio_array = audio_array.astype(np.float32) / 2147483648.0
+                        else:
+                            audio_array = audio_array.astype(np.float32)
+
+                        # Resample if needed
+                        if sampling_rate != 16000:
+                            audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=16000)
+                            sampling_rate = 16000
+                        st.info("✅ Audio loaded with scipy.wavfile")
+                    except Exception as e3:
+                        st.error(f"All audio loading methods failed: {e3}")
+                        # Clean up and return empty
+                        try:
+                            os.unlink(tmp_file_path)
+                        except:
+                            pass
+                        return ""
 
             # Clean up temporary file
-            os.unlink(tmp_file_path)
+            try:
+                os.unlink(tmp_file_path)
+            except:
+                pass
         else:
             audio_array = audio_data
             sampling_rate = 16000
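The hunk above replaces a single librosa.load call with a three-backend fallback chain (librosa, then soundfile, then scipy.io.wavfile). Below is a minimal standalone sketch of the same pattern; the helper name load_audio_16k is hypothetical, while the backends, resampling, and integer-PCM scaling factors are taken from the diff.

import numpy as np
import librosa
import soundfile as sf
from scipy.io import wavfile

def load_audio_16k(path: str) -> np.ndarray:
    """Return mono float32 audio at 16 kHz, trying librosa, then soundfile, then scipy."""
    try:
        # librosa resamples and converts to float32 in one call
        audio, sr = librosa.load(path, sr=16000, dtype=np.float32)
        return audio
    except Exception:
        pass
    try:
        audio, sr = sf.read(path)
    except Exception:
        sr, audio = wavfile.read(path)
        # Integer PCM is scaled by its full range so values land in [-1, 1]
        if audio.dtype == np.int16:
            audio = audio.astype(np.float32) / 32768.0
        elif audio.dtype == np.int32:
            audio = audio.astype(np.float32) / 2147483648.0
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim > 1:
        audio = audio[:, 0]  # keep the first channel
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    return audio

Dividing int16 samples by 32768.0 (and int32 by 2147483648.0) maps the full integer range into [-1.0, 1.0), matching what librosa and soundfile already return for float output.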
@@ -359,31 +399,46 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
         if hasattr(audio_array, 'astype'):
             audio_array = audio_array.astype(np.float32)
 
+        # Validate audio array
+        if audio_array is None or len(audio_array) == 0:
+            st.error("❌ Audio array is empty or invalid")
+            return ""
+
         # Ensure audio is normalized and in correct format
         if isinstance(audio_array, np.ndarray):
+            # Handle multi-channel audio by taking the first channel
+            if len(audio_array.shape) > 1:
+                audio_array = audio_array[:, 0]  # Take first channel if stereo
+
+            # Check minimum audio length (at least 0.5 seconds)
+            min_length = int(0.5 * 16000)  # 0.5 seconds at 16 kHz
+            if len(audio_array) < min_length:
+                st.warning("⚠️ Audio is too short (less than 0.5 seconds). Please record a longer audio.")
+                return ""
+
             # Normalize audio to [-1, 1] range if needed
             if np.max(np.abs(audio_array)) > 1.0:
                 audio_array = audio_array / np.max(np.abs(audio_array))
 
             # Ensure float32 dtype
             audio_array = audio_array.astype(np.float32)
+
+        st.info(f"📊 Audio processed: {len(audio_array)/16000:.2f}s duration, shape: {audio_array.shape}")
 
         # Process audio with Whisper
         try:
+            st.info("🔄 Processing with Whisper...")
             # Try with language parameter first
-            input_features = whisper_processor(
-                audio_array,
-                sampling_rate=16000,
-                return_tensors="pt",
-                language="english"  # Set default language to English
-            ).input_features
-        except Exception as proc_error:
-            # Fallback without language parameter
             input_features = whisper_processor(
                 audio_array,
                 sampling_rate=16000,
                 return_tensors="pt"
             ).input_features
+
+            st.info("✅ Audio features extracted successfully")
+        except Exception as proc_error:
+            st.error(f"❌ Failed to process audio features: {proc_error}")
+            return ""
 
         # Get device and model info
         device = "cuda" if torch.cuda.is_available() else "cpu"
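The new validation guards (non-empty array, first channel of multi-channel input, at least 0.5 s of audio, peak normalization) all run before feature extraction. A condensed sketch of those checks feeding a transformers WhisperProcessor; the helper name prepare_features is hypothetical, and the thresholds mirror the diff.

import numpy as np

def prepare_features(audio: np.ndarray, processor):
    if audio is None or len(audio) == 0:
        raise ValueError("empty audio")
    if audio.ndim > 1:
        audio = audio[:, 0]  # stereo -> first channel
    if len(audio) < int(0.5 * 16000):  # require at least 0.5 s at 16 kHz
        raise ValueError("audio shorter than 0.5 s")
    peak = np.max(np.abs(audio))
    if peak > 1.0:  # rescale only when samples fall outside [-1, 1]
        audio = audio / peak
    audio = audio.astype(np.float32)
    # WhisperProcessor pads/trims to 30 s and returns log-mel input features
    return processor(audio, sampling_rate=16000, return_tensors="pt").input_features

Raising instead of returning "" keeps the sketch self-contained; the committed code reports through st.error/st.warning and returns an empty string instead.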
@@ -394,53 +449,65 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
 
         # Generate transcription with error handling
         try:
+            st.info("🔄 Generating transcription...")
             with torch.no_grad():
-                forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="english", task="transcribe")
+                # Try simple generation first
                 predicted_ids = whisper_model.generate(
                     input_features,
                     max_length=448,  # Standard max length for Whisper
                     num_beams=1,  # Faster generation
                     do_sample=False,  # Deterministic output
-                    forced_decoder_ids=forced_decoder_ids,
+                    temperature=0.0,  # Deterministic
+                    use_cache=True
                 )
+            st.info("✅ Transcription generated successfully")
         except RuntimeError as e:
+            st.warning(f"⚠️ First attempt failed: {e}")
+            if "dtype" in str(e).lower() or "float16" in str(e).lower():
+                try:
+                    # Try forcing float32 for both input and model
+                    st.info("🔄 Retrying with float32...")
+                    input_features = input_features.float()
+                    if hasattr(whisper_model, 'float'):
+                        whisper_model = whisper_model.float()
+                    with torch.no_grad():
+                        predicted_ids = whisper_model.generate(
+                            input_features,
+                            max_length=448,
+                            num_beams=1,
+                            do_sample=False,
+                            temperature=0.0,
+                            use_cache=True
+                        )
+                    st.info("✅ Transcription generated with float32")
+                except Exception as e2:
+                    st.error(f"❌ Float32 attempt failed: {e2}")
+                    return ""
             else:
+                st.error(f"❌ Generation failed: {e}")
+                return ""
         except Exception as generation_error:
-            try:
-                with torch.no_grad():
-                    predicted_ids = whisper_model.generate(
-                        input_features,
-                        max_length=448,
-                        num_beams=1,
-                        do_sample=False
-                    )
-            except Exception as final_error:
-                raise final_error
+            st.error(f"❌ Unexpected generation error: {generation_error}")
+            return ""
 
         # Decode transcription
-        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-        return transcription.strip()
+        try:
+            transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+            transcription = transcription.strip()
+
+            if not transcription:
+                st.warning("⚠️ Transcription is empty. The audio might be silent or unclear.")
+                return ""
+
+            st.success(f"✅ Transcription successful: '{transcription[:50]}{'...' if len(transcription) > 50 else ''}'")
+            return transcription
+
+        except Exception as decode_error:
+            st.error(f"❌ Failed to decode transcription: {decode_error}")
+            return ""
 
     except Exception as e:
-        st.error(f"Error transcribing audio: {e}")
+        st.error(f"❌ Error transcribing audio: {e}")
         logging.error(f"Transcription error: {e}")
         return ""
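Generation now retries once in float32 when a dtype or float16 RuntimeError appears, which typically happens when a model loaded in half precision receives float32 features on CPU. A condensed sketch of that retry-and-decode flow, assuming model and processor are a loaded WhisperForConditionalGeneration and WhisperProcessor; temperature is omitted here because it has no effect when do_sample=False.

import torch

def generate_text(model, processor, input_features) -> str:
    gen_kwargs = dict(max_length=448, num_beams=1, do_sample=False, use_cache=True)
    try:
        with torch.no_grad():
            ids = model.generate(input_features, **gen_kwargs)
    except RuntimeError as e:
        if "dtype" not in str(e).lower() and "float16" not in str(e).lower():
            raise
        # Cast both the features and the model to float32 and retry once
        input_features = input_features.float()
        model = model.float()
        with torch.no_grad():
            ids = model.generate(input_features, **gen_kwargs)
    # batch_decode strips the special tokens and returns plain text
    return processor.batch_decode(ids, skip_special_tokens=True)[0].strip()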
@@ -1290,7 +1357,7 @@ def main():
                 )
                 if transcribed_text and transcribed_text != st.session_state.last_processed_message:
                     st.session_state.last_processed_message = transcribed_text
-                    st.success(f"Transcribed: {transcribed_text}")
+                    st.success(f"✅ Transcribed: {transcribed_text}")
                     # Add transcribed text to chat
                     st.session_state.messages.append({"role": "user", "content": transcribed_text})
 
@@ -1331,11 +1398,16 @@ def main():
 
                     # Trigger rerun to display the conversation
                     st.rerun()
+                elif not transcribed_text:
+                    # Reset states if transcription failed or returned empty
+                    st.session_state.waiting_for_input = True
+                    st.session_state.processing_complete = True
+                    st.error("❌ Could not transcribe audio. Please ensure the audio is clear and contains speech, then try again.")
                 else:
-                    # Reset states if transcription failed
+                    # Reset states if duplicate message
                     st.session_state.waiting_for_input = True
                     st.session_state.processing_complete = True
-                    st.
+                    st.warning("⚠️ This audio was already processed. Please record a new message.")
 
             except Exception:
                 # Fallback to file uploader
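The main() hunks split the non-rerun path into a failed-transcription branch and a duplicate-audio branch, keyed off st.session_state.last_processed_message. A minimal sketch of that dedup pattern using the names from the diff; the surrounding recorder wiring and the messages list are assumed to exist.

import streamlit as st

# Initialize once per session
if "last_processed_message" not in st.session_state:
    st.session_state.last_processed_message = None

def handle_transcription(transcribed_text: str) -> None:
    if transcribed_text and transcribed_text != st.session_state.last_processed_message:
        # New, non-empty result: remember it, append to the chat, redraw
        st.session_state.last_processed_message = transcribed_text
        st.session_state.messages.append({"role": "user", "content": transcribed_text})
        st.rerun()
    elif not transcribed_text:
        st.error("Could not transcribe audio. Please try again.")  # failed or empty result
    else:
        st.warning("This audio was already processed.")  # duplicate input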
@@ -1408,7 +1480,7 @@ def main():
                 # Reset states if transcription failed
                 st.session_state.waiting_for_input = True
                 st.session_state.processing_complete = True
-                st.error("Could not transcribe audio. Please try again.")
+                st.error("❌ Could not transcribe audio. Please ensure the audio file is valid and contains speech, then try again.")
 
             # Show ready status when waiting for input
             if st.session_state.waiting_for_input and st.session_state.processing_complete: