Spaces:
Sleeping
Sleeping
Update streamlit_app.py
Browse files
- streamlit_app.py +0 -14
streamlit_app.py
CHANGED
|
@@ -346,7 +346,6 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 346 |
# Method 1: Try librosa
|
| 347 |
try:
|
| 348 |
audio_array, sampling_rate = librosa.load(tmp_file_path, sr=16000, dtype=np.float32)
|
| 349 |
-
st.info("✅ Audio loaded with librosa")
|
| 350 |
except Exception as e:
|
| 351 |
st.warning(f"Librosa failed: {e}")
|
| 352 |
|
|
@@ -358,7 +357,6 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 358 |
sampling_rate = 16000
|
| 359 |
# Ensure float32 dtype
|
| 360 |
audio_array = audio_array.astype(np.float32)
|
| 361 |
-
st.info("✅ Audio loaded with soundfile")
|
| 362 |
except Exception as e2:
|
| 363 |
st.warning(f"Soundfile failed: {e2}")
|
| 364 |
|
|
@@ -377,7 +375,6 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 377 |
if sampling_rate != 16000:
|
| 378 |
audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=16000)
|
| 379 |
sampling_rate = 16000
|
| 380 |
-
st.info("✅ Audio loaded with scipy.wavfile")
|
| 381 |
except Exception as e3:
|
| 382 |
st.error(f"All audio loading methods failed: {e3}")
|
| 383 |
# Clean up and return empty
|
|
@@ -422,20 +419,15 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 422 |
|
| 423 |
# Ensure float32 dtype
|
| 424 |
audio_array = audio_array.astype(np.float32)
|
| 425 |
-
|
| 426 |
-
st.info(f"📊 Audio processed: {len(audio_array)/16000:.2f}s duration, shape: {audio_array.shape}")
|
| 427 |
|
| 428 |
# Process audio with Whisper
|
| 429 |
try:
|
| 430 |
-
st.info("🔄 Processing with Whisper...")
|
| 431 |
# Try with language parameter first
|
| 432 |
input_features = whisper_processor(
|
| 433 |
audio_array,
|
| 434 |
sampling_rate=16000,
|
| 435 |
return_tensors="pt"
|
| 436 |
).input_features
|
| 437 |
-
|
| 438 |
-
st.info("✅ Audio features extracted successfully")
|
| 439 |
except Exception as proc_error:
|
| 440 |
st.error(f"❌ Failed to process audio features: {proc_error}")
|
| 441 |
return ""
|
|
@@ -449,7 +441,6 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 449 |
|
| 450 |
# Generate transcription with error handling
|
| 451 |
try:
|
| 452 |
-
st.info("🔄 Generating transcription...")
|
| 453 |
with torch.no_grad():
|
| 454 |
# Try simple generation first
|
| 455 |
predicted_ids = whisper_model.generate(
|
|
@@ -460,13 +451,11 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 460 |
temperature=0.0, # Deterministic
|
| 461 |
use_cache=True
|
| 462 |
)
|
| 463 |
-
st.info("✅ Transcription generated successfully")
|
| 464 |
except RuntimeError as e:
|
| 465 |
st.warning(f"⚠️ First attempt failed: {e}")
|
| 466 |
if "dtype" in str(e).lower() or "float16" in str(e).lower():
|
| 467 |
try:
|
| 468 |
# Try forcing float32 for both input and model
|
| 469 |
-
st.info("🔄 Retrying with float32...")
|
| 470 |
input_features = input_features.float()
|
| 471 |
if hasattr(whisper_model, 'float'):
|
| 472 |
whisper_model = whisper_model.float()
|
|
@@ -479,7 +468,6 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 479 |
temperature=0.0,
|
| 480 |
use_cache=True
|
| 481 |
)
|
| 482 |
-
st.info("✅ Transcription generated with float32")
|
| 483 |
except Exception as e2:
|
| 484 |
st.error(f"❌ Float32 attempt failed: {e2}")
|
| 485 |
return ""
|
|
@@ -499,7 +487,6 @@ def transcribe_audio(audio_data, whisper_model, whisper_processor):
|
|
| 499 |
st.warning("⚠️ Transcription is empty. The audio might be silent or unclear.")
|
| 500 |
return ""
|
| 501 |
|
| 502 |
-
st.success(f"✅ Transcription successful: '{transcription[:50]}{'...' if len(transcription) > 50 else ''}'")
|
| 503 |
return transcription
|
| 504 |
|
| 505 |
except Exception as decode_error:
|
|
@@ -1407,7 +1394,6 @@ def main():
|
|
| 1407 |
# Reset states if duplicate message
|
| 1408 |
st.session_state.waiting_for_input = True
|
| 1409 |
st.session_state.processing_complete = True
|
| 1410 |
-
st.warning("⚠️ This audio was already processed. Please record a new message.")
|
| 1411 |
|
| 1412 |
except Exception:
|
| 1413 |
# Fallback to file uploader
|
|
|
|
| 346 |
# Method 1: Try librosa
|
| 347 |
try:
|
| 348 |
audio_array, sampling_rate = librosa.load(tmp_file_path, sr=16000, dtype=np.float32)
|
|
|
|
| 349 |
except Exception as e:
|
| 350 |
st.warning(f"Librosa failed: {e}")
|
| 351 |
|
|
|
|
| 357 |
sampling_rate = 16000
|
| 358 |
# Ensure float32 dtype
|
| 359 |
audio_array = audio_array.astype(np.float32)
|
|
|
|
| 360 |
except Exception as e2:
|
| 361 |
st.warning(f"Soundfile failed: {e2}")
|
| 362 |
|
|
|
|
| 375 |
if sampling_rate != 16000:
|
| 376 |
audio_array = librosa.resample(audio_array, orig_sr=sampling_rate, target_sr=16000)
|
| 377 |
sampling_rate = 16000
|
|
|
|
| 378 |
except Exception as e3:
|
| 379 |
st.error(f"All audio loading methods failed: {e3}")
|
| 380 |
# Clean up and return empty
|
|
|
|
| 419 |
|
| 420 |
# Ensure float32 dtype
|
| 421 |
audio_array = audio_array.astype(np.float32)
|
|
|
|
|
|
|
| 422 |
|
| 423 |
# Process audio with Whisper
|
| 424 |
try:
|
|
|
|
| 425 |
# Try with language parameter first
|
| 426 |
input_features = whisper_processor(
|
| 427 |
audio_array,
|
| 428 |
sampling_rate=16000,
|
| 429 |
return_tensors="pt"
|
| 430 |
).input_features
|
|
|
|
|
|
|
| 431 |
except Exception as proc_error:
|
| 432 |
st.error(f"❌ Failed to process audio features: {proc_error}")
|
| 433 |
return ""
|
|
|
|
| 441 |
|
| 442 |
# Generate transcription with error handling
|
| 443 |
try:
|
|
|
|
| 444 |
with torch.no_grad():
|
| 445 |
# Try simple generation first
|
| 446 |
predicted_ids = whisper_model.generate(
|
|
|
|
| 451 |
temperature=0.0, # Deterministic
|
| 452 |
use_cache=True
|
| 453 |
)
|
|
|
|
| 454 |
except RuntimeError as e:
|
| 455 |
st.warning(f"⚠️ First attempt failed: {e}")
|
| 456 |
if "dtype" in str(e).lower() or "float16" in str(e).lower():
|
| 457 |
try:
|
| 458 |
# Try forcing float32 for both input and model
|
|
|
|
| 459 |
input_features = input_features.float()
|
| 460 |
if hasattr(whisper_model, 'float'):
|
| 461 |
whisper_model = whisper_model.float()
|
|
|
|
| 468 |
temperature=0.0,
|
| 469 |
use_cache=True
|
| 470 |
)
|
|
|
|
| 471 |
except Exception as e2:
|
| 472 |
st.error(f"❌ Float32 attempt failed: {e2}")
|
| 473 |
return ""
|
|
|
|
| 487 |
st.warning("⚠️ Transcription is empty. The audio might be silent or unclear.")
|
| 488 |
return ""
|
| 489 |
|
|
|
|
| 490 |
return transcription
|
| 491 |
|
| 492 |
except Exception as decode_error:
|
|
|
|
| 1394 |
# Reset states if duplicate message
|
| 1395 |
st.session_state.waiting_for_input = True
|
| 1396 |
st.session_state.processing_complete = True
|
|
|
|
| 1397 |
|
| 1398 |
except Exception:
|
| 1399 |
# Fallback to file uploader
|