import io
import os
import tempfile
import zipfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from matplotlib import pyplot as plt
from PIL import Image
from transformers import AutoFeatureExtractor, AutoModelForAudioFrameClassification

from recitations_segmenter import segment_recitations, clean_speech_intervals

# Setup device and model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

print(f"Loading model on {device}...")
processor = AutoFeatureExtractor.from_pretrained("obadx/recitation-segmenter-v2")
model = AutoModelForAudioFrameClassification.from_pretrained(
    "obadx/recitation-segmenter-v2",
    torch_dtype=dtype,
    device_map=device,
)
print("Model loaded successfully!")


def read_audio(path, sampling_rate=16000):
    """Read an audio file, mix it down to mono, and resample to the target rate."""
    audio, sr = sf.read(path)
    if len(audio.shape) > 1:
        audio = audio.mean(axis=1)
    if sr != sampling_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
    return torch.tensor(audio).float()


def get_interval(x: np.ndarray, intervals: list[list[int]], idx: int, sr=16000, delta=0.3, exact_boundaries=False):
    """Extract a single audio segment from the detected intervals."""
    start = int((intervals[idx][0] - delta) * sr)
    end = int(intervals[idx][1] * sr)
    if not exact_boundaries:
        # Extend each segment up to the start of the next one (minus delta),
        # so the silence between segments is kept with the preceding segment.
        start = 0 if idx == 0 else int((intervals[idx][0] - delta) * sr)
        end = len(x) if idx == len(intervals) - 1 else int((intervals[idx + 1][0] - delta) * sr)
    start = max(start, 0)  # guard against a negative start when the first interval begins before `delta`
    return x[start: end]


def plot_signal(x: np.ndarray, intervals: list[list[float]], log_min_count=5, sr=16000):
    """Plot the audio signal with the detected interval boundaries."""
    fig, ax = plt.subplots(figsize=(20, 4))

    if isinstance(x, torch.Tensor):
        x = x.numpy()
    ax.plot(x, linewidth=0.5)

    intervals_flat = np.array(intervals).reshape(-1)
    diffs = np.diff(intervals_flat)
    min_silence_diffs_idx = float('-inf')
    info_text = ""

    if len(intervals_flat) > 2:
        # Odd-indexed diffs are the silences between speech intervals.
        silence_diffs = diffs[1: len(diffs): 2]
        min_silence_diffs_ids = silence_diffs.argsort()[: log_min_count]
        min_silence_diffs_idx = min_silence_diffs_ids[0] * 2 + 1
        info_text += f'Minimum Silence Interval IDs: {min_silence_diffs_ids}\n'
        info_text += f'Minimum Silence Intervals: {silence_diffs[min_silence_diffs_ids]}\n'

        # Even-indexed diffs are the speech intervals themselves.
        speech_diffs = diffs[0: len(diffs): 2]
        min_speech_diffs_ids = speech_diffs.argsort()[: log_min_count]
        info_text += f'Minimum Speech Interval IDs: {min_speech_diffs_ids}\n'
        info_text += f'Minimum Speech Intervals: {speech_diffs[min_speech_diffs_ids]}\n'

    ymin = x.min()
    ymax = x.max()
    for idx, val in enumerate(intervals_flat):
        color = 'red'
        if idx in [min_silence_diffs_idx, min_silence_diffs_idx + 1]:
            color = 'green'
        ax.axvline(x=val * sr, ymin=0, ymax=1, color=color, alpha=0.6, linewidth=1)

    ax.set_xlabel('Samples')
    ax.set_ylabel('Amplitude')
    ax.set_title('Audio Signal with Detected Intervals')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()

    return img, info_text


def process_audio(audio_file, min_silence_ms, min_speech_ms, pad_ms):
    """Process the uploaded audio file and split it into segments."""
    if audio_file is None:
        return None, "⚠️ من فضلك ارفع ملف صوتي", None, []

    try:
        # Read the file
        wav = read_audio(audio_file)

        # Segment the recitation
        sampled_outputs = segment_recitations(
            [wav],
            model,
            processor,
            device=device,
            dtype=dtype,
            batch_size=4,
        )

        # Clean the detected intervals
        clean_out = clean_speech_intervals(
            sampled_outputs[0].speech_intervals,
            sampled_outputs[0].is_complete,
            min_silence_duration_ms=min_silence_ms,
            min_speech_duration_ms=min_speech_ms,
            pad_duration_ms=pad_ms,
            return_seconds=True,
        )

        intervals = clean_out.clean_speech_intervals

        # Plot the signal
        plot_img, stats_text = plot_signal(wav, intervals)

        # Extract the audio segments
        num_segments = len(intervals)
        result_text = f"✅ تم التقطيع بنجاح!\n\n"
        result_text += f"📊 عدد المقاطع: {num_segments}\n"
        result_text += f"⏱️ طول الملف الأصلي: {len(wav)/16000:.2f} ثانية\n\n"
        result_text += "=" * 50 + "\n"
        result_text += stats_text
        result_text += "=" * 50 + "\n\n"

        # Create a temporary directory for the segments
        temp_dir = tempfile.mkdtemp()
        segment_files = []

        for idx in range(num_segments):
            audio_seg = get_interval(
                x=wav,
                intervals=intervals,
                idx=idx,
                delta=0.050,
                exact_boundaries=True,
            )
            if isinstance(audio_seg, torch.Tensor):
                audio_seg = audio_seg.cpu().numpy()

            duration = len(audio_seg) / 16000
            result_text += f"مقطع {idx + 1}: من {intervals[idx][0]:.2f}s إلى {intervals[idx][1]:.2f}s (المدة: {duration:.2f}s)\n"

            # Save the segment
            segment_path = os.path.join(temp_dir, f"segment_{idx+1:03d}.wav")
            sf.write(segment_path, audio_seg, 16000)
            segment_files.append(segment_path)

        # Create a ZIP file with all segments
        zip_path = os.path.join(temp_dir, "segments.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for seg_file in segment_files:
                zipf.write(seg_file, os.path.basename(seg_file))

        # Build HTML to display the segments
        audio_html = "