// Spaces: Running  (appears to be a status banner captured from the hosting page; not part of the script)
/**
 * Silero VAD for speech detection and silence trimming.
 * Based on the approach in the ibm-granite/granite-speech HF demo.
 */

// ONNX Runtime inference session for the VAD model.
// Stays null until loadVAD() has created it; reused by every later call.
let vadSession = null;

// Sample rate (Hz) reported to the model through its `sr` input.
const VAD_SAMPLE_RATE = 16000;

// Number of samples fed to the model per inference step (32 ms at 16 kHz).
const VAD_CHUNK_SIZE = 512;

// Name of the Cache API bucket in which downloaded model files are persisted.
const MODEL_CACHE_NAME = 'granite-speech-local-models';
/**
 * Fetch a URL, persisting successful responses in the Cache API.
 *
 * A cached copy is returned when one exists. Otherwise the URL is fetched
 * from the network and, if the response is OK, a clone is stored in the
 * MODEL_CACHE_NAME bucket for next time.
 *
 * Caching is best-effort: when the Cache API is unavailable, or reading or
 * writing the cache fails, the function falls back to a plain fetch instead
 * of failing the download.
 *
 * @param {string} url - Resource to fetch.
 * @returns {Promise<Response>} Cached or freshly fetched response. The
 *   response may be non-OK; callers should check `response.ok`.
 * @throws Whatever `fetch` rejects with (e.g. a network error).
 */
async function cachedFetch(url) {
  let cache = null;

  try {
    // `caches` is not defined in every context, so probe before using it.
    if (typeof caches !== 'undefined') {
      cache = await caches.open(MODEL_CACHE_NAME);
      const cached = await cache.match(url);
      if (cached) return cached;
    }
  } catch (err) {
    console.warn(`Cache lookup failed for ${url}; fetching from network`, err);
    cache = null;
  }

  const response = await fetch(url);

  // Only successful responses are stored, so an error page is never pinned.
  if (cache && response.ok) {
    try {
      // Store a clone: the body of `response` itself must stay unread for the caller.
      await cache.put(url, response.clone());
    } catch (err) {
      // A failed write must not discard a response that was already downloaded.
      console.warn(`Could not cache ${url}`, err);
    }
  }

  return response;
}
// In-flight model load, shared by concurrent loadVAD() callers so the model
// is downloaded and the session created only once. Reset when the load settles.
let vadLoadPromise = null;

/**
 * Load the VAD model into `vadSession` if it is not loaded yet.
 *
 * Fetches './silero_vad.onnx' through cachedFetch and creates an ONNX Runtime
 * inference session on the 'wasm' execution provider. Calls made while a load
 * is already running wait for that same load. A failed load leaves
 * `vadSession` null, so a later call retries.
 *
 * @returns {Promise<void>} Resolves once `vadSession` is ready.
 * @throws {Error} If the model file cannot be fetched successfully, or if
 *   session creation rejects.
 */
async function loadVAD() {
  if (vadSession) return;

  if (!vadLoadPromise) {
    vadLoadPromise = (async () => {
      try {
        console.log('Loading VAD model...');
        const response = await cachedFetch('./silero_vad.onnx');
        // Fail here with a clear message rather than handing an error page to the runtime.
        if (!response.ok) {
          throw new Error(`Failed to fetch VAD model './silero_vad.onnx' (HTTP ${response.status})`);
        }
        const buffer = await response.arrayBuffer();
        vadSession = await ort.InferenceSession.create(buffer, {
          executionProviders: ['wasm'],
        });
        console.log('VAD model loaded');
      } finally {
        vadLoadPromise = null;
      }
    })();
  }

  await vadLoadPromise;
}
/**
 * Turn per-chunk speech probabilities into contiguous speech segments.
 *
 * A segment opens at the first chunk whose probability is >= threshold and
 * closes at the start of the next chunk that falls below it. A segment still
 * open after the last chunk is closed at `totalSamples`.
 *
 * @param {ArrayLike<number>} speechProbs - One probability per chunk.
 * @param {number} threshold - Minimum probability counted as speech.
 * @param {number} totalSamples - Length of the analysed audio, in samples.
 * @returns {{start: number, end: number}[]} Segments in samples.
 */
function speechProbsToSegments(speechProbs, threshold, totalSamples) {
  const segments = [];
  let speechStart = null; // null while outside a speech run

  for (let i = 0; i < speechProbs.length; i++) {
    const isSpeech = speechProbs[i] >= threshold;
    if (isSpeech && speechStart === null) {
      speechStart = i * VAD_CHUNK_SIZE;
    } else if (!isSpeech && speechStart !== null) {
      segments.push({ start: speechStart, end: i * VAD_CHUNK_SIZE });
      speechStart = null;
    }
  }

  if (speechStart !== null) {
    segments.push({ start: speechStart, end: totalSamples });
  }

  return segments;
}

/**
 * Get speech timestamps using Silero VAD.
 *
 * The audio is processed in consecutive chunks of VAD_CHUNK_SIZE samples (the
 * last chunk is zero-padded). The state returned by the model for one chunk is
 * fed back in with the next chunk.
 *
 * @param {Float32Array|number[]} audioData - Audio samples to analyse.
 * @param {number} [threshold=0.5] - Minimum probability counted as speech.
 * @returns {Promise<{start: number, end: number}[]>} Speech segments with
 *   start/end expressed in samples; empty when no chunk reaches the threshold.
 */
async function getSpeechTimestamps(audioData, threshold = 0.5) {
  await loadVAD();

  // Initial state, shape [2, 1, 128], all zeros.
  let state = new Float32Array(2 * 1 * 128);

  // The sample-rate input never changes, so build it once instead of per chunk.
  const srTensor = new ort.Tensor(
    'int64',
    BigInt64Array.from([BigInt(VAD_SAMPLE_RATE)]),
    [],
  );

  const speechProbs = [];

  for (let offset = 0; offset < audioData.length; offset += VAD_CHUNK_SIZE) {
    const chunkEnd = Math.min(offset + VAD_CHUNK_SIZE, audioData.length);

    // A fresh zero-filled buffer per chunk keeps the final, shorter chunk zero-padded.
    const chunk = new Float32Array(VAD_CHUNK_SIZE);
    // subarray() is a zero-copy view on typed arrays; plain arrays fall back to slice().
    chunk.set(
      typeof audioData.subarray === 'function'
        ? audioData.subarray(offset, chunkEnd)
        : audioData.slice(offset, chunkEnd),
    );

    const outputs = await vadSession.run({
      input: new ort.Tensor('float32', chunk, [1, VAD_CHUNK_SIZE]),
      state: new ort.Tensor('float32', state, [2, 1, 128]),
      sr: srTensor,
    });

    speechProbs.push(outputs.output.data[0]);
    // Copy the returned state so the next step does not alias the output buffer.
    state = new Float32Array(outputs.stateN.data);
  }

  return speechProbsToSegments(speechProbs, threshold, audioData.length);
}
/**
 * Get speech segments with merging (like the granite-speech demo).
 *
 * Runs VAD on the audio, converts the detected segments to seconds, and then:
 *  - pads the start of the first segment by `startBuffer` (clamped at 0);
 *  - merges a segment into the previous one when its padded start is less
 *    than `minGap` seconds after the previous segment's end;
 *  - otherwise starts a new segment at the previous segment's end, so the
 *    returned segments leave no gaps between them.
 *
 * When VAD finds no speech, a single segment spanning the whole audio is
 * returned.
 *
 * @param {Float32Array|number[]} audioData - Audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - Sample rate used to convert
 *   sample offsets to seconds.
 * @param {{startBuffer?: number, minGap?: number}} [options]
 * @param {number} [options.startBuffer=0.3] - Seconds of padding before a
 *   segment's start.
 * @param {number} [options.minGap=0.5] - Gaps shorter than this many seconds
 *   are merged away.
 * @returns {Promise<{start: number, end: number}[]>} Segments in seconds.
 */
async function getSpeechSegments(
  audioData,
  sampleRate = VAD_SAMPLE_RATE,
  { startBuffer = 0.3, minGap = 0.5 } = {},
) {
  const vadSegments = await getSpeechTimestamps(audioData);

  if (vadSegments.length === 0) {
    return [{ start: 0, end: audioData.length / sampleRate }];
  }

  const segments = [];

  for (const { start, end } of vadSegments) {
    const endSec = end / sampleRate;
    const bufferedStart = Math.max(0, start / sampleRate - startBuffer);
    const previous = segments[segments.length - 1];

    if (previous === undefined) {
      // First segment: nothing to merge with, keep the padded start.
      segments.push({ start: bufferedStart, end: endSec });
    } else if (bufferedStart - previous.end < minGap) {
      // Close enough to the previous segment: extend it.
      previous.end = endSec;
    } else {
      // New segment begins where the previous one ended to avoid gaps.
      segments.push({ start: previous.end, end: endSec });
    }
  }

  return segments;
}
/**
 * Trim silence from audio (simple version - just trims the start and end).
 *
 * Keeps everything from the start of the first detected speech segment to the
 * end of the last one, widened by `paddingSec` on each side and clamped to
 * the bounds of the audio. Silence between segments is kept. Logs how much
 * was removed from each end.
 *
 * @param {Float32Array|number[]} audioData - Audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - Sample rate used to convert
 *   between seconds and samples.
 * @param {number} [paddingSec=0.3] - Seconds of audio kept around the speech;
 *   expected to be non-negative.
 * @returns {Promise<Float32Array|number[]>} A trimmed copy of the audio, or
 *   the original `audioData` object itself when no speech is detected.
 */
async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE, paddingSec = 0.3) {
  const segments = await getSpeechTimestamps(audioData);

  if (segments.length === 0) {
    console.log('VAD: No speech detected, returning original audio');
    return audioData;
  }

  const paddingSamples = Math.floor(paddingSec * sampleRate);
  const firstSpeechSample = segments[0].start;
  const lastSpeechSample = segments[segments.length - 1].end;

  // Clamp so padding never reaches outside the audio.
  const start = Math.max(0, firstSpeechSample - paddingSamples);
  const end = Math.min(audioData.length, lastSpeechSample + paddingSamples);

  const trimmedStart = (start / sampleRate).toFixed(2);
  const trimmedEnd = ((audioData.length - end) / sampleRate).toFixed(2);
  console.log(`VAD: Trimmed ${trimmedStart}s from start, ${trimmedEnd}s from end`);

  return audioData.slice(start, end);
}
/**
 * Format a duration in seconds as M:SS.
 *
 * Minutes are not zero-padded and are not rolled over into hours (3725 s is
 * "62:05"); seconds are always two digits. Fractions of a second are dropped.
 * Negative or non-finite input (NaN, Infinity, undefined) is rendered as
 * "0:00" rather than as a malformed string such as "-1:-1" or "NaN:NaN".
 *
 * @param {number} seconds - Duration in seconds.
 * @returns {string} The formatted timestamp.
 */
function formatTimestamp(seconds) {
  const value = Number(seconds);
  const totalSeconds = Number.isFinite(value) && value > 0 ? Math.floor(value) : 0;
  const mins = Math.floor(totalSeconds / 60);
  const secs = totalSeconds % 60;
  return `${mins}:${String(secs).padStart(2, '0')}`;
}
// Export: publish the public API on the global object so other scripts can
// call these functions by name. `globalThis` is the same object as `window` on
// a browser page, and also exists where `window` does not (workers, Node).
globalThis.loadVAD = loadVAD;
globalThis.trimSilence = trimSilence;
globalThis.getSpeechSegments = getSpeechSegments;
globalThis.formatTimestamp = formatTimestamp;