Spaces:

ibm-granite
/

granite-speech-webgpu

Running

App Files Files Community

gsaon committed on Mar 16

Commit

dbc1132

verified ·

1 Parent(s): b7e38a0

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

README.md +3 -6
app.js +1095 -0
index.html +134 -18
pcs_vocab.json +0 -0
punct_cap_seg_en.onnx +3 -0
punctuator.js +294 -0
silero_vad.onnx +3 -0
style.css +627 -17
vad.js +157 -0

README.md CHANGED Viewed

@@ -1,11 +1,8 @@
 ---
-title: Granite Speech Webgpu
-emoji: 🚀
-colorFrom: pink
 colorTo: indigo
 sdk: static
 pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Granite Speech WebGPU
+emoji: 🗣️
+colorFrom: blue
 colorTo: indigo
 sdk: static
 pinned: false
 ---

app.js ADDED Viewed

	@@ -0,0 +1,1095 @@

+/**
+ * Granite Speech WebGPU Demo
+ * Uses ONNX Runtime Web for in-browser speech recognition
+ */
+import { PreTrainedTokenizer } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2';
+import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';
+// Check if ONNX Runtime is loaded
+if (typeof ort === 'undefined') {
+    console.error('ONNX Runtime Web not loaded! Check if the script tag is correct.');
+    alert('Failed to load ONNX Runtime. Please refresh the page.');
+} else {
+    // Configure WASM paths to CDN
+    ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/';
+    // WASM settings - enable multi-threading for encoder performance
+    ort.env.wasm.numThreads = navigator.hardwareConcurrency || 4;
+    ort.env.wasm.simd = true;
+    // WebGPU settings
+    ort.env.webgpu = ort.env.webgpu || {};
+}
+// Model paths
+// Granite Speech ONNX models hosted on HF Hub
+// The three graphs are loaded separately by initModels(): audio encoder,
+// token-embedding lookup, and the merged (with/without past) decoder.
+const HF_MODEL_BASE = 'https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main/onnx';
+const ENCODER_PATH = `${HF_MODEL_BASE}/audio_encoder_q4f32.onnx`;
+const EMBED_PATH = `${HF_MODEL_BASE}/embed_tokens_q4f16.onnx`;
+const DECODER_PATH = `${HF_MODEL_BASE}/decoder_model_merged_q4f16.onnx`;
+// Audio config from preprocessor_config.json
+// SAMPLE_RATE is in Hz; N_FFT, HOP_LENGTH and WIN_LENGTH are in samples
+// (see computeMelSpectrogram for how each one is used).
+const SAMPLE_RATE = 16000;
+const N_MELS = 80;
+const N_FFT = 512;
+const HOP_LENGTH = 160;
+const WIN_LENGTH = 400;
+// Model config
+// HIDDEN_SIZE is the per-token embedding width used when concatenating
+// embeddings; VOCAB_SIZE is the stride used to locate the last-position logits.
+const HIDDEN_SIZE = 2048;
+const VOCAB_SIZE = 100353;
+// NOTE(review): BOS_TOKEN and EOS_TOKEN share the same id here; only EOS_TOKEN
+// is referenced by the generation loop in this part of the file.
+const BOS_TOKEN = 100257;
+const EOS_TOKEN = 100257;
+const PAD_TOKEN = 100256;
+// Upper bound on decoding steps per audio segment
+const MAX_NEW_TOKENS = 256;
+// Note: embedding_multiplier (12) is likely already applied in the model weights
+// Prompt templates
+// The final decoder input is: PROMPT_PREFIX embeddings + audio embeddings +
+// the embeddings of one PROMPTS entry (chosen by the prompt <select> value).
+const PROMPT_PREFIX = 'USER: ';
+const PROMPTS = {
+    'transcribe': 'Transcribe the speech to text\n ASSISTANT:',
+    'translate_en': 'Translate the speech to English\n ASSISTANT:',
+    'translate_fr': 'Translate the speech to French\n ASSISTANT:',
+    'translate_de': 'Translate the speech to German\n ASSISTANT:',
+    'translate_es': 'Translate the speech to Spanish\n ASSISTANT:',
+    'translate_pt': 'Translate the speech to Portuguese\n ASSISTANT:',
+    'translate_ja': 'Translate the speech to Japanese\n ASSISTANT:',
+};
+// State
+// ONNX Runtime inference sessions; null until initModels() has loaded them
+let encoderSession = null;
+let embedSession = null;
+let decoderSession = null;
+// transformers.js tokenizer instance; null until loadTokenizer() resolves
+let tokenizer = null;
+// Re-entrancy guard for initModels(); it is only reset when loading fails
+let isModelLoading = false;
+// Mono Float32Array of the currently loaded audio (set by processAudioBlob)
+let currentAudioData = null;
+// Pre-computed prompt embeddings (populated at init)
+// Each value has the shape returned by getEmbeddings(): { data, seqLen }
+const promptEmbeddings = {
+    prefix: null,  // "USER: "
+    // suffix embeddings keyed by prompt name
+};
+// DOM Elements
+// These lookups run once at module evaluation, so the elements must already
+// exist in the document; a missing element yields null here.
+// Status indicator
+const statusDot = document.getElementById('statusDot');
+const statusText = document.getElementById('statusText');
+// Audio input: microphone button, file picker and drop target
+const recordBtn = document.getElementById('recordBtn');
+const audioFile = document.getElementById('audioFile');
+const fileTile = document.querySelector('.file-label');
+const inputCard = document.querySelector('.input-card');
+// Audio preview: player, play/pause button and waveform
+const audioPreview = document.getElementById('audioPreview');
+const audioPlayer = document.getElementById('audioPlayer');
+const playBtn = document.getElementById('playBtn');
+const waveformCanvas = document.getElementById('waveformCanvas');
+const waveformProgress = document.getElementById('waveformProgress');
+const audioTime = document.getElementById('audioTime');
+// Transcription controls
+const transcribeSection = document.getElementById('transcribeSection');
+const transcribeBtn = document.getElementById('transcribeBtn');
+const promptSelect = document.getElementById('promptSelect');
+const punctuationCheckbox = document.getElementById('punctuationCheckbox');
+// Transcript output and its action buttons
+const transcriptCard = document.getElementById('transcriptCard');
+const outputText = document.getElementById('outputText');
+const copyBtn = document.getElementById('copyBtn');
+const downloadBtn = document.getElementById('downloadBtn');
+const clearBtn = document.getElementById('clearBtn');
+// Progress bar and WebGPU diagnostics line
+const progressSection = document.getElementById('progressSection');
+const progressFill = document.getElementById('progressFill');
+const progressText = document.getElementById('progressText');
+const gpuInfo = document.getElementById('gpuInfo');
+// Recording state
+// Active MediaRecorder (null before the first recording) and its collected chunks
+let mediaRecorder = null;
+let audioChunks = [];
+// Checked between segments by transcribe(); presumably set by a handler
+// outside this part of the file — TODO confirm
+let transcriptionAborted = false;
+// Utility functions
+function setStatus(status, message) {
+    statusDot.className = `status-dot ${status}`;
+    statusText.textContent = message;
+}
+// Punctuation is handled by punctuator.js (applyPunctuation function)
+function showProgress(show) {
+    progressSection.style.display = show ? 'block' : 'none';
+}
+function updateProgress(progress, text) {
+    progressFill.style.width = `${progress}%`;
+    progressText.textContent = text;
+}
+// Check WebGPU support
+async function checkWebGPU() {
+    if (!navigator.gpu) {
+        gpuInfo.textContent = 'WebGPU not supported. Use Chrome 113+ or Edge 113+';
+        gpuInfo.style.color = '#e74c3c';
+        return false;
+    }
+    try {
+        const adapter = await navigator.gpu.requestAdapter();
+        if (!adapter) {
+            gpuInfo.textContent = 'No WebGPU adapter available';
+            gpuInfo.style.color = '#f39c12';
+            return false;
+        }
+        return true;
+    } catch (e) {
+        console.error('WebGPU error:', e);
+        gpuInfo.textContent = `WebGPU error: ${e.message || e}`;
+        gpuInfo.style.color = '#e74c3c';
+        return false;
+    }
+}
+// Load tokenizer using transformers.js
+async function loadTokenizer() {
+    const [tokenizerJson, tokenizerConfig] = await Promise.all([
+        fetch('https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main/tokenizer.json').then(r => r.json()),
+        fetch('https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main/tokenizer_config.json').then(r => r.json())
+    ]);
+    return new PreTrainedTokenizer(tokenizerJson, tokenizerConfig);
+}
+// Get embeddings for token IDs (returns Float32Array)
+async function getEmbeddings(tokenIds) {
+    const idsTensor = new ort.Tensor('int64', BigInt64Array.from(tokenIds.map(BigInt)), [1, tokenIds.length]);
+    const output = await embedSession.run({ input_ids: idsTensor });
+    return {
+        data: new Float32Array(output.inputs_embeds.data),
+        seqLen: output.inputs_embeds.dims[1]
+    };
+}
+// Pre-compute embeddings for all prompts
+async function precomputePromptEmbeddings() {
+    // Prefix embedding
+    const prefixTokens = tokenizer.encode(PROMPT_PREFIX, { add_special_tokens: false });
+    promptEmbeddings.prefix = await getEmbeddings(prefixTokens);
+    // Suffix embeddings for each prompt
+    for (const [key, text] of Object.entries(PROMPTS)) {
+        const tokens = tokenizer.encode(text, { add_special_tokens: false });
+        promptEmbeddings[key] = await getEmbeddings(tokens);
+    }
+    console.log('Pre-computed embeddings for', Object.keys(promptEmbeddings).length, 'prompts');
+}
+// Session options - WebGPU only (no WASM fallback)
+// Shared by all three InferenceSession.create() calls made through
+// loadModelWithExternalData(). Because 'webgpu' is the only execution
+// provider listed, session creation depends on WebGPU being available.
+const sessionOptions = {
+    executionProviders: ['webgpu'],
+    enableMemPattern: false,
+    enableCpuMemArena: false,
+    graphOptimizationLevel: 'basic',
+};
+// Force garbage collection pause
+async function gcPause() {
+    // Give browser time to garbage collect
+    await new Promise(resolve => setTimeout(resolve, 100));
+}
+// Load ONNX model with external data support
+async function loadModelWithExternalData(modelPath, options) {
+    // Check if external data file exists
+    const dataPath = modelPath.replace('.onnx', '.onnx_data');
+    const modelResponse = await fetch(modelPath);
+    const modelBuffer = await modelResponse.arrayBuffer();
+    const dataResponse = await fetch(dataPath);
+    if (!dataResponse.ok) {
+        // No external data, load model directly
+        return await ort.InferenceSession.create(modelBuffer, options);
+    }
+    const dataBuffer = await dataResponse.arrayBuffer();
+    // Extract filename from path for external data reference
+    const dataFileName = dataPath.split('/').pop();
+    // Create session with external data
+    const sessionOptionsWithData = {
+        ...options,
+        externalData: [
+            {
+                path: dataFileName,
+                data: dataBuffer,
+            }
+        ]
+    };
+    return await ort.InferenceSession.create(modelBuffer, sessionOptionsWithData);
+}
+// Initialize ONNX Runtime and load models
+async function initModels() {
+    if (isModelLoading) return;
+    isModelLoading = true;
+    setStatus('loading', 'Loading models...');
+    showProgress(true);
+    try {
+        const hasWebGPU = await checkWebGPU();
+        updateProgress(10, 'Initializing ONNX Runtime...');
+        // Load tokenizer
+        updateProgress(15, 'Loading tokenizer...');
+        tokenizer = await loadTokenizer();
+        // Load models one at a time with GC pauses between
+        // Use loadModelWithExternalData to handle .onnx_data files
+        // Load encoder model (q4f32 with WebGPU)
+        updateProgress(20, 'Loading encoder model...');
+        encoderSession = await loadModelWithExternalData(ENCODER_PATH, sessionOptions);
+        await gcPause();
+        // Load embed tokens model
+        updateProgress(40, 'Loading embed tokens model...');
+        embedSession = await loadModelWithExternalData(EMBED_PATH, sessionOptions);
+        // Pre-compute prompt embeddings
+        updateProgress(50, 'Pre-computing prompt embeddings...');
+        await precomputePromptEmbeddings();
+        await gcPause();
+        // Load decoder model
+        updateProgress(60, 'Loading decoder model...');
+        decoderSession = await loadModelWithExternalData(DECODER_PATH, sessionOptions);
+        updateProgress(100, 'Models loaded!');
+        showProgress(false);
+        setStatus('ready', 'Ready - Record or upload audio');
+        enableControls(true);
+    } catch (error) {
+        console.error('Model loading failed:', error);
+        console.error('Error stack:', error?.stack);
+        const errorMsg = error?.message || error?.toString() || 'Unknown error';
+        setStatus('error', `Error: ${errorMsg}`);
+        showProgress(false);
+        isModelLoading = false;
+    }
+}
+function enableControls(enabled) {
+    recordBtn.disabled = !enabled;
+    audioFile.disabled = !enabled;
+}
+// Mel spectrogram computation
+// Uses custom implementation matching torchaudio
+/**
+ * Compute a log10 mel power spectrogram of a mono signal.
+ *
+ * Steps: reflect-pad by N_FFT/2 on both sides, slice frames of N_FFT samples
+ * every HOP_LENGTH samples, apply a WIN_LENGTH Hann window centred inside each
+ * frame, take the power spectrum, project onto N_MELS mel filters and take
+ * log10 with a floor of 1e-10.
+ *
+ * @param {Float32Array} audioData - mono samples (the filterbank is built for SAMPLE_RATE)
+ * @returns {{data: Float32Array, numFrames: number, numMels: number}}
+ *   `data` is frame-major: value for (frame, mel) is at `frame * N_MELS + mel`
+ */
+function computeMelSpectrogram(audioData) {
+    // Pad signal with reflection (center=True, pad_mode='reflect')
+    const padLength = Math.floor(N_FFT / 2);
+    const paddedLength = audioData.length + 2 * padLength;
+    const paddedAudio = new Float32Array(paddedLength);
+    // Reflect padding at start: for position -i, use position i (not i-1)
+    // numpy reflect: for index -1, reflects to index 1
+    for (let i = 0; i < padLength; i++) {
+        // Position -(i+1) reflects to position (i+1)
+        // The Math.min clamp keeps the index in range when the signal is
+        // shorter than the padding.
+        const srcIdx = Math.min(i + 1, audioData.length - 1);
+        paddedAudio[padLength - 1 - i] = audioData[srcIdx];
+    }
+    // Copy original audio
+    for (let i = 0; i < audioData.length; i++) {
+        paddedAudio[padLength + i] = audioData[i];
+    }
+    // Reflect padding at end
+    for (let i = 0; i < padLength; i++) {
+        // Mirror around the last sample; clamped at 0 for very short signals
+        const srcIdx = Math.max(0, audioData.length - 2 - i);
+        paddedAudio[padLength + audioData.length + i] = audioData[srcIdx];
+    }
+    // Calculate number of frames
+    const numFrames = Math.floor((paddedLength - N_FFT) / HOP_LENGTH) + 1;
+    // Create mel filterbank (torchaudio HTK style)
+    // Note: rebuilt on every call
+    const melFilterbank = createMelFilterbank(N_FFT, N_MELS, SAMPLE_RATE);
+    // Hann window (periodic=True like torchaudio)
+    const window = new Float32Array(WIN_LENGTH);
+    for (let i = 0; i < WIN_LENGTH; i++) {
+        window[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / WIN_LENGTH));
+    }
+    const melSpec = new Float32Array(numFrames * N_MELS);
+    // torch.stft center-pads the window when win_length < n_fft
+    // Window is placed at indices padLeft to padLeft+win_length
+    const padLeft = Math.floor((N_FFT - WIN_LENGTH) / 2);  // = 56
+    for (let frame = 0; frame < numFrames; frame++) {
+        const start = frame * HOP_LENGTH;
+        // Apply center-padded window (matching torch.stft behavior)
+        // Read n_fft samples, apply window centered in the middle
+        const windowed = new Float32Array(N_FFT);  // initialized to zeros
+        for (let i = 0; i < WIN_LENGTH; i++) {
+            windowed[padLeft + i] = paddedAudio[start + padLeft + i] * window[i];
+        }
+        // Compute power spectrum
+        const powerSpec = computePowerSpectrum(windowed);
+        // Apply mel filterbank and log10
+        for (let m = 0; m < N_MELS; m++) {
+            let sum = 0;
+            for (let k = 0; k < N_FFT / 2 + 1; k++) {
+                sum += powerSpec[k] * melFilterbank[m * (N_FFT / 2 + 1) + k];
+            }
+            // Floor at 1e-10 so silent bins give -10 instead of -Infinity
+            melSpec[frame * N_MELS + m] = Math.log10(Math.max(sum, 1e-10));
+        }
+    }
+    return { data: melSpec, numFrames, numMels: N_MELS };
+}
+// Create mel filterbank (torchaudio HTK style)
+/**
+ * Build triangular mel filters spanning 0 Hz to sampleRate / 2.
+ * No per-filter normalization is applied to the triangles.
+ *
+ * @param {number} nfft - FFT size; the filters cover nfft / 2 + 1 bins
+ * @param {number} nMels - number of mel filters
+ * @param {number} sampleRate - sample rate in Hz
+ * @returns {Float32Array} weights laid out mel-major: filter m, bin k is at
+ *   `m * (nfft / 2 + 1) + k`
+ */
+function createMelFilterbank(nfft, nMels, sampleRate) {
+    const numBins = nfft / 2 + 1;
+    const filterbank = new Float32Array(nMels * numBins);
+    // HTK mel scale
+    const hzToMel = (hz) => 2595 * Math.log10(1 + hz / 700);
+    const melToHz = (mel) => 700 * (Math.pow(10, mel / 2595) - 1);
+    const fMin = 0;
+    const fMax = sampleRate / 2;
+    const melMin = hzToMel(fMin);
+    const melMax = hzToMel(fMax);
+    // Create mel-spaced frequency points (n_mels + 2 points)
+    // Points m, m+1, m+2 are the left edge, peak and right edge of filter m
+    const fPts = new Float32Array(nMels + 2);
+    for (let i = 0; i < nMels + 2; i++) {
+        fPts[i] = melToHz(melMin + (melMax - melMin) * i / (nMels + 1));
+    }
+    // Create frequency array for each FFT bin
+    const allFreqs = new Float32Array(numBins);
+    for (let i = 0; i < numBins; i++) {
+        allFreqs[i] = i * sampleRate / nfft;
+    }
+    // Compute frequency differences
+    const fDiff = new Float32Array(nMels + 1);
+    for (let i = 0; i < nMels + 1; i++) {
+        fDiff[i] = fPts[i + 1] - fPts[i];
+    }
+    // Create triangular filters using slopes (torchaudio style)
+    for (let m = 0; m < nMels; m++) {
+        for (let k = 0; k < numBins; k++) {
+            const freq = allFreqs[k];
+            // Rising edge from fPts[m] and falling edge towards fPts[m + 2];
+            // the smaller of the two, clipped at zero, forms the triangle
+            const lowSlope = (freq - fPts[m]) / fDiff[m];
+            const upSlope = (fPts[m + 2] - freq) / fDiff[m + 1];
+            filterbank[m * numBins + k] = Math.max(0, Math.min(lowSlope, upSlope));
+        }
+    }
+    return filterbank;
+}
+// Compute power spectrum using radix-2 FFT
+function computePowerSpectrum(signal) {
+    const n = signal.length;
+    // Use radix-2 FFT for power of 2 lengths
+    if ((n & (n - 1)) === 0) {
+        return computePowerSpectrumFFT(signal);
+    }
+    // Fallback to DFT for non-power-of-2
+    const spectrum = new Float32Array(n / 2 + 1);
+    for (let k = 0; k <= n / 2; k++) {
+        let real = 0, imag = 0;
+        for (let t = 0; t < n; t++) {
+            const angle = -2 * Math.PI * k * t / n;
+            real += signal[t] * Math.cos(angle);
+            imag += signal[t] * Math.sin(angle);
+        }
+        spectrum[k] = real * real + imag * imag;
+    }
+    return spectrum;
+}
+// Radix-2 FFT for power spectrum
+function computePowerSpectrumFFT(signal) {
+    const n = signal.length;
+    // Bit-reversal permutation
+    const real = new Float32Array(n);
+    const imag = new Float32Array(n);
+    for (let i = 0; i < n; i++) {
+        let j = 0;
+        let x = i;
+        for (let k = 0; k < Math.log2(n); k++) {
+            j = (j << 1) | (x & 1);
+            x >>= 1;
+        }
+        real[j] = signal[i];
+    }
+    // Cooley-Tukey FFT
+    for (let size = 2; size <= n; size *= 2) {
+        const halfSize = size / 2;
+        const step = Math.PI / halfSize;
+        for (let i = 0; i < n; i += size) {
+            for (let j = 0; j < halfSize; j++) {
+                const angle = -j * step;
+                const cos = Math.cos(angle);
+                const sin = Math.sin(angle);
+                const idx1 = i + j;
+                const idx2 = i + j + halfSize;
+                const tReal = cos * real[idx2] - sin * imag[idx2];
+                const tImag = sin * real[idx2] + cos * imag[idx2];
+                real[idx2] = real[idx1] - tReal;
+                imag[idx2] = imag[idx1] - tImag;
+                real[idx1] = real[idx1] + tReal;
+                imag[idx1] = imag[idx1] + tImag;
+            }
+        }
+    }
+    // Compute power spectrum (first half + DC and Nyquist)
+    const spectrum = new Float32Array(n / 2 + 1);
+    for (let k = 0; k <= n / 2; k++) {
+        spectrum[k] = real[k] * real[k] + imag[k] * imag[k];
+    }
+    return spectrum;
+}
+// Prepare audio features for encoder
+function prepareAudioFeatures(audioData) {
+    const melSpec = computeMelSpectrogram(audioData);
+    // Apply Granite Speech normalization:
+    // 1. Already have log10 mel from computeMelSpectrogram
+    // 2. Normalize: max(logmel, max - 8) / 4 + 1
+    const logmel = melSpec.data;
+    let maxVal = -Infinity;
+    for (let i = 0; i < logmel.length; i++) {
+        if (logmel[i] > maxVal) maxVal = logmel[i];
+    }
+    const normalized = new Float32Array(logmel.length);
+    for (let i = 0; i < logmel.length; i++) {
+        normalized[i] = Math.max(logmel[i], maxVal - 8) / 4 + 1;
+    }
+    // Remove last frame if odd
+    let numFrames = melSpec.numFrames;
+    if (numFrames % 2 === 1) {
+        numFrames -= 1;
+    }
+    // Stack 2 consecutive frames -> 160 features (80 mels * 2)
+    const stackedFrames = numFrames / 2;
+    const features = new Float32Array(stackedFrames * 160);
+    for (let t = 0; t < stackedFrames; t++) {
+        // First frame of pair
+        for (let m = 0; m < N_MELS; m++) {
+            features[t * 160 + m] = normalized[(t * 2) * N_MELS + m];
+        }
+        // Second frame of pair
+        for (let m = 0; m < N_MELS; m++) {
+            features[t * 160 + N_MELS + m] = normalized[(t * 2 + 1) * N_MELS + m];
+        }
+    }
+    return { data: features, shape: [1, stackedFrames, 160] };
+}
+// Transcribe a single audio segment and return the text
+/**
+ * Encode one audio segment and greedily decode text for it.
+ *
+ * The decoder input is the concatenation of the pre-computed prefix
+ * embeddings, the encoder's audio embeddings and the embeddings of the prompt
+ * currently selected in the prompt <select> (falling back to 'transcribe').
+ * Decoding stops at EOS_TOKEN or after MAX_NEW_TOKENS steps.
+ *
+ * @param {Float32Array} audioSegment - mono samples for this segment
+ * @param {function(string): void} [onPartialResult] - called after every
+ *   generated token with the decoded text so far
+ * @returns {Promise<string>} the decoded text for the segment
+ */
+async function transcribeSegment(audioSegment, onPartialResult) {
+    // Prepare audio features
+    const audioFeatures = prepareAudioFeatures(audioSegment);
+    // Run encoder
+    const encoderInput = new ort.Tensor('float32', audioFeatures.data, audioFeatures.shape);
+    const encoderOutput = await encoderSession.run({ input_features: encoderInput });
+    const audioEmbeddings = encoderOutput.audio_features;
+    // Get pre-computed prompt embeddings
+    const prefixEmbed = promptEmbeddings.prefix;
+    const suffixEmbed = promptEmbeddings[promptSelect.value] || promptEmbeddings['transcribe'];
+    // Concatenate embeddings using TypedArray.set()
+    // Layout: [prefix | audio | suffix], each row HIDDEN_SIZE wide
+    const prefixSeqLen = prefixEmbed.seqLen;
+    const audioSeqLen = audioEmbeddings.dims[1];
+    const suffixSeqLen = suffixEmbed.seqLen;
+    const totalSeqLen = prefixSeqLen + audioSeqLen + suffixSeqLen;
+    const combinedEmbeds = new Float32Array(totalSeqLen * HIDDEN_SIZE);
+    combinedEmbeds.set(prefixEmbed.data, 0);
+    combinedEmbeds.set(new Float32Array(audioEmbeddings.data), prefixSeqLen * HIDDEN_SIZE);
+    combinedEmbeds.set(suffixEmbed.data, (prefixSeqLen + audioSeqLen) * HIDDEN_SIZE);
+    // Autoregressive generation
+    // currentEmbeds/currentSeqLen describe what is fed this step (the whole
+    // prompt on step 0, a single token afterwards); totalSeqLenSoFar is the
+    // full attended length including cached positions.
+    let generatedTokens = [];
+    let currentEmbeds = combinedEmbeds;
+    let currentSeqLen = totalSeqLen;
+    let pastKeyValues = null;
+    // Number of past_key_values.{i} input pairs fed to the decoder
+    const numLayers = 40;
+    let totalSeqLenSoFar = totalSeqLen;
+    for (let step = 0; step < MAX_NEW_TOKENS; step++) {
+        const attentionMask = new BigInt64Array(totalSeqLenSoFar).fill(1n);
+        const embedsTensor = new ort.Tensor('float32', currentEmbeds, [1, currentSeqLen, HIDDEN_SIZE]);
+        const maskTensor = new ort.Tensor('int64', attentionMask, [1, totalSeqLenSoFar]);
+        const decoderInputs = {
+            inputs_embeds: embedsTensor,
+            attention_mask: maskTensor,
+        };
+        if (pastKeyValues) {
+            // Feed the previous step's present.* outputs back as the cache
+            for (let i = 0; i < numLayers; i++) {
+                decoderInputs[`past_key_values.${i}.key`] = pastKeyValues[`present.${i}.key`];
+                decoderInputs[`past_key_values.${i}.value`] = pastKeyValues[`present.${i}.value`];
+            }
+        } else {
+            // First step: zero-length float16 cache tensors of shape [1, 4, 0, 128]
+            const emptyPast = new Uint16Array(0);
+            for (let i = 0; i < numLayers; i++) {
+                decoderInputs[`past_key_values.${i}.key`] = new ort.Tensor('float16', emptyPast, [1, 4, 0, 128]);
+                decoderInputs[`past_key_values.${i}.value`] = new ort.Tensor('float16', emptyPast, [1, 4, 0, 128]);
+            }
+        }
+        const decoderOutput = await decoderSession.run(decoderInputs);
+        // The whole output map is kept; only its present.* entries are read next step
+        pastKeyValues = decoderOutput;
+        // NOTE(review): assumes logits.data holds numeric values that
+        // Float32Array.from can convert element-wise — confirm the output dtype
+        const logitsFloat32 = Float32Array.from(decoderOutput.logits.data);
+        // Extract logits for last position and find argmax
+        const logitOffset = (currentSeqLen - 1) * VOCAB_SIZE;
+        const lastLogits = logitsFloat32.subarray(logitOffset, logitOffset + VOCAB_SIZE);
+        let nextToken = 0, maxVal = lastLogits[0];
+        for (let i = 1; i < VOCAB_SIZE; i++) {
+            if (lastLogits[i] > maxVal) { maxVal = lastLogits[i]; nextToken = i; }
+        }
+        // Avoid EOS on first token - take second best
+        if (step === 0 && nextToken === EOS_TOKEN) {
+            nextToken = 0; maxVal = -Infinity;
+            for (let i = 0; i < VOCAB_SIZE; i++) {
+                if (i !== EOS_TOKEN && lastLogits[i] > maxVal) { maxVal = lastLogits[i]; nextToken = i; }
+            }
+        }
+        if (nextToken === EOS_TOKEN) {
+            break;
+        }
+        generatedTokens.push(nextToken);
+        // Callback for streaming updates
+        if (onPartialResult) {
+            onPartialResult(tokenizer.decode(generatedTokens));
+        }
+        // Embed the chosen token; it becomes the single-position input of the next step
+        const nextTokenTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(nextToken)]), [1, 1]);
+        const nextEmbedOutput = await embedSession.run({ input_ids: nextTokenTensor });
+        currentEmbeds = new Float32Array(nextEmbedOutput.inputs_embeds.data);
+        currentSeqLen = 1;
+        totalSeqLenSoFar += 1;
+    }
+    return tokenizer.decode(generatedTokens);
+}
+// Wait until audio playback reaches a specific time
+function waitForPlaybackTime(targetTime) {
+    return new Promise((resolve) => {
+        const check = () => {
+            if (audioPlayer.paused || audioPlayer.currentTime >= targetTime) {
+                resolve();
+            } else {
+                requestAnimationFrame(check);
+            }
+        };
+        check();
+    });
+}
+// Run inference with segmentation and audio sync
+async function transcribe() {
+    if (!encoderSession || !embedSession || !decoderSession || !currentAudioData) {
+        setStatus('error', 'Model or audio not ready');
+        return;
+    }
+    setStatus('processing', 'Processing audio...');
+    transcribeBtn.disabled = true;
+    transcriptionAborted = false;
+    outputText.textContent = '';
+    transcriptCard.style.display = 'block';
+    showProgress(true);
+    try {
+        // Get speech segments using VAD
+        updateProgress(5, 'Detecting speech segments...');
+        const segments = await getSpeechSegments(currentAudioData, SAMPLE_RATE);
+        console.log(`VAD found ${segments.length} segment(s)`);
+        // Start audio playback immediately
+        audioPlayer.currentTime = 0;
+        audioPlayer.play();
+        playBtn.querySelector('.play-icon').style.display = 'none';
+        playBtn.querySelector('.pause-icon').style.display = 'block';
+        const playbackStartTime = performance.now() / 1000;
+        // Process and display segments in sync with audio
+        const displayedResults = [];
+        const totalSegments = segments.length;
+        for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
+            if (transcriptionAborted) break;
+            const seg = segments[segIdx];
+            // Update progress bar
+            const segProgress = ((segIdx + 1) / totalSegments) * 100;
+            updateProgress(segProgress, '');
+            // Wait for audio to reach this segment's start time
+            const elapsed = (performance.now() / 1000) - playbackStartTime;
+            const waitTime = seg.start - elapsed;
+            if (waitTime > 0) {
+                await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
+            }
+            setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);
+            // Extract and transcribe this segment
+            const startSample = Math.floor(seg.start * SAMPLE_RATE);
+            const endSample = Math.floor(seg.end * SAMPLE_RATE);
+            const audioSegment = currentAudioData.slice(startSample, endSample);
+            const timestamp = formatTimestamp(seg.start);
+            const makeRow = (ts, text) => `<div class="transcript-row"><span class="timestamp">${ts}</span><span class="transcript-text">${text}</span></div>`;
+            // Transcribe with streaming display
+            const segmentText = await transcribeSegment(audioSegment, (partial) => {
+                const escaped = partial.replace(/</g, '&lt;').replace(/>/g, '&gt;');
+                const rows = [...displayedResults, makeRow(timestamp, escaped)];
+                outputText.innerHTML = rows.join('');
+                outputText.scrollTop = outputText.scrollHeight;
+            });
+            if (segmentText.trim()) {
+                let finalSegmentText = segmentText.trim();
+                // Auto-detect language and apply punctuation if supported
+                if (punctuationCheckbox.checked) {
+                    const detectedLang = detect(finalSegmentText);
+                    const supportedLangs = ['en', ...window.MULTILINGUAL_PUNCT_LANGS];
+                    if (supportedLangs.includes(detectedLang)) {
+                        // Strip existing punctuation before applying punctuation model
+                        const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
+                        finalSegmentText = await applyPunctuation(stripped, detectedLang);
+                        // Replace unknown tokens from punctuator with spaces
+                        finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
+                    }
+                }
+                const escaped = finalSegmentText.replace(/</g, '&lt;').replace(/>/g, '&gt;');
+                displayedResults.push(makeRow(timestamp, escaped));
+                outputText.innerHTML = displayedResults.join('');
+                outputText.scrollTop = outputText.scrollHeight;
+            }
+        }
+        // Final output
+        if (displayedResults.length === 0) {
+            outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
+        }
+        copyBtn.disabled = false;
+        showProgress(false);
+        setStatus('ready', 'Transcription complete');
+    } catch (error) {
+        console.error('Transcription failed:', error);
+        setStatus('error', `Error: ${error.message}`);
+        showProgress(false);
+    }
+    transcribeBtn.disabled = false;
+}
+// Audio recording
+let isRecording = false;
+function toggleRecording() {
+    if (isRecording) {
+        stopRecording();
+    } else {
+        startRecording();
+    }
+}
+async function startRecording() {
+    try {
+        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+        mediaRecorder = new MediaRecorder(stream);
+        audioChunks = [];
+        mediaRecorder.ondataavailable = (event) => {
+            audioChunks.push(event.data);
+        };
+        mediaRecorder.onstop = async () => {
+            const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+            const audioUrl = URL.createObjectURL(audioBlob);
+            audioPlayer.src = audioUrl;
+            audioPreview.style.display = 'flex';
+            transcribeSection.style.display = 'flex';
+            await processAudioBlob(audioBlob);
+            drawWaveform();
+            updateAudioTime();
+            stream.getTracks().forEach(track => track.stop());
+        };
+        mediaRecorder.start();
+        isRecording = true;
+        setStatus('recording', 'Recording...');
+        // Update button UI
+        recordBtn.querySelector('.mic-icon').style.display = 'none';
+        recordBtn.querySelector('.stop-icon').style.display = 'block';
+        recordBtn.querySelector('span').textContent = 'Stop';
+        recordBtn.classList.add('recording');
+    } catch (error) {
+        console.error('Recording failed:', error);
+        setStatus('error', 'Microphone access denied');
+    }
+}
+function stopRecording() {
+    if (mediaRecorder && mediaRecorder.state !== 'inactive') {
+        mediaRecorder.stop();
+        isRecording = false;
+        setStatus('ready', 'Recording stopped - Click Transcribe');
+        // Update button UI
+        recordBtn.querySelector('.mic-icon').style.display = 'block';
+        recordBtn.querySelector('.stop-icon').style.display = 'none';
+        recordBtn.querySelector('span').textContent = 'Record';
+        recordBtn.classList.remove('recording');
+    }
+}
+// Process audio file/blob
+async function processAudioBlob(blob) {
+    try {
+        const arrayBuffer = await blob.arrayBuffer();
+        const audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
+        const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
+        // Convert to mono Float32Array
+        let audioData;
+        if (audioBuffer.numberOfChannels > 1) {
+            const left = audioBuffer.getChannelData(0);
+            const right = audioBuffer.getChannelData(1);
+            audioData = new Float32Array(left.length);
+            for (let i = 0; i < left.length; i++) {
+                audioData[i] = (left[i] + right[i]) / 2;
+            }
+        } else {
+            audioData = audioBuffer.getChannelData(0);
+        }
+        // Resample if needed
+        if (audioBuffer.sampleRate !== SAMPLE_RATE) {
+            audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
+        }
+        currentAudioData = audioData;
+        transcribeBtn.disabled = false;
+    } catch (error) {
+        console.error('Audio processing failed:', error);
+        setStatus('error', 'Failed to process audio');
+    }
+}
+// Simple linear resampling
+function resample(audioData, fromRate, toRate) {
+    const ratio = fromRate / toRate;
+    const newLength = Math.round(audioData.length / ratio);
+    const result = new Float32Array(newLength);
+    for (let i = 0; i < newLength; i++) {
+        const srcIndex = i * ratio;
+        const srcIndexFloor = Math.floor(srcIndex);
+        const srcIndexCeil = Math.min(srcIndexFloor + 1, audioData.length - 1);
+        const t = srcIndex - srcIndexFloor;
+        result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
+    }
+    return result;
+}
+// Handle file upload
+async function handleFileUpload(event) {
+    const file = event.target.files[0];
+    if (!file) return;
+    await loadAudioFile(file);
+}
+// Handle dropped files
+async function handleFileDrop(event) {
+    event.preventDefault();
+    inputCard.classList.remove('drag-over');
+    const file = event.dataTransfer.files[0];
+    if (!file || !file.type.startsWith('audio/')) {
+        setStatus('error', 'Please drop an audio file');
+        return;
+    }
+    await loadAudioFile(file);
+}
+// Common file loading logic
+async function loadAudioFile(file) {
+    setStatus('processing', 'Processing audio file...');
+    const audioUrl = URL.createObjectURL(file);
+    audioPlayer.src = audioUrl;
+    audioPreview.style.display = 'flex';
+    transcribeSection.style.display = 'flex';
+    await processAudioBlob(file);
+    drawWaveform();
+    updateAudioTime();
+    setStatus('ready', 'Audio loaded - Click Transcribe');
+}
+// Draw waveform visualization
+function drawWaveform() {
+    if (!currentAudioData) return;
+    const canvas = waveformCanvas;
+    const ctx = canvas.getContext('2d');
+    const dpr = window.devicePixelRatio || 1;
+    // Set canvas size
+    const rect = canvas.getBoundingClientRect();
+    canvas.width = rect.width * dpr;
+    canvas.height = rect.height * dpr;
+    ctx.scale(dpr, dpr);
+    const width = rect.width;
+    const height = rect.height;
+    const centerY = height / 2;
+    // Downsample audio data for visualization
+    const samples = currentAudioData;
+    const barCount = Math.floor(width / 3);
+    const samplesPerBar = Math.floor(samples.length / barCount);
+    // Calculate bar amplitudes
+    const barAmplitudes = [];
+    for (let i = 0; i < barCount; i++) {
+        let sum = 0;
+        const start = i * samplesPerBar;
+        for (let j = 0; j < samplesPerBar; j++) {
+            sum += Math.abs(samples[start + j] || 0);
+        }
+        barAmplitudes.push(sum / samplesPerBar);
+    }
+    // Find max amplitude for normalization
+    const maxAmp = Math.max(...barAmplitudes, 0.01);
+    // Get color based on color scheme
+    const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
+    ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';
+    // Draw bars normalized to fill height
+    for (let i = 0; i < barCount; i++) {
+        const normalized = barAmplitudes[i] / maxAmp;
+        const barHeight = Math.max(2, normalized * height * 0.9);
+        ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
+    }
+}
+// Format time as M:SS
+function formatTime(seconds) {
+    const mins = Math.floor(seconds / 60);
+    const secs = Math.floor(seconds % 60);
+    return `${mins}:${secs.toString().padStart(2, '0')}`;
+}
+// Update audio time display
+function updateAudioTime() {
+    const current = audioPlayer.currentTime || 0;
+    const duration = audioPlayer.duration || 0;
+    if (duration > 0) {
+        audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
+        waveformProgress.style.width = `${(current / duration) * 100}%`;
+    } else {
+        audioTime.textContent = formatTime(currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0);
+    }
+}
+// Toggle play/pause
+function togglePlayback() {
+    if (audioPlayer.paused) {
+        audioPlayer.play();
+        playBtn.querySelector('.play-icon').style.display = 'none';
+        playBtn.querySelector('.pause-icon').style.display = 'block';
+    } else {
+        audioPlayer.pause();
+        playBtn.querySelector('.play-icon').style.display = 'block';
+        playBtn.querySelector('.pause-icon').style.display = 'none';
+        // Stop transcription if running
+        if (!transcriptionAborted && transcribeBtn.disabled) {
+            transcriptionAborted = true;
+            showProgress(false);
+            setStatus('ready', 'Transcription stopped');
+            transcribeBtn.disabled = false;
+        }
+    }
+}
+// Seek in audio
+function seekAudio(event) {
+    const rect = waveformCanvas.getBoundingClientRect();
+    const x = event.clientX - rect.left;
+    const percent = x / rect.width;
+    audioPlayer.currentTime = percent * audioPlayer.duration;
+    updateAudioTime();
+}
+// Copy to clipboard
+async function copyToClipboard() {
+    try {
+        await navigator.clipboard.writeText(outputText.textContent);
+        // Brief visual feedback via title attribute
+        const originalTitle = copyBtn.title;
+        copyBtn.title = 'Copied!';
+        setTimeout(() => {
+            copyBtn.title = originalTitle;
+        }, 2000);
+    } catch (error) {
+        console.error('Copy failed:', error);
+    }
+}
+function downloadTranscript() {
+    // Convert <br> to newlines and strip other HTML
+    const text = outputText.innerHTML
+        .replace(/<br\s*\/?>/gi, '\n')
+        .replace(/<[^>]+>/g, '');
+    if (!text) return;
+    const blob = new Blob([text], { type: 'text/plain' });
+    const url = URL.createObjectURL(blob);
+    const a = document.createElement('a');
+    a.href = url;
+    a.download = 'transcript.txt';
+    document.body.appendChild(a);
+    a.click();
+    document.body.removeChild(a);
+    URL.revokeObjectURL(url);
+    // Brief visual feedback
+    const originalTitle = downloadBtn.title;
+    downloadBtn.title = 'Downloaded!';
+    setTimeout(() => {
+        downloadBtn.title = originalTitle;
+    }, 2000);
+}
+function clearAudio() {
+    // Stop any playback
+    audioPlayer.pause();
+    audioPlayer.src = '';
+    // Reset audio state
+    currentAudioData = null;
+    // Hide audio player and transcribe section
+    audioPreview.style.display = 'none';
+    transcribeSection.style.display = 'none';
+    // Clear transcript
+    transcriptCard.style.display = 'none';
+    outputText.textContent = '';
+    // Reset waveform
+    waveformProgress.style.width = '0%';
+    const ctx = waveformCanvas.getContext('2d');
+    ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);
+    // Reset time display
+    audioTime.textContent = '0:00';
+    // Reset buttons
+    transcribeBtn.disabled = true;
+    // Reset file input
+    audioFile.value = '';
+    // Update status
+    setStatus('ready', 'Ready');
+}
+// Event listeners
+recordBtn.addEventListener('click', toggleRecording);
+audioFile.addEventListener('change', handleFileUpload);
+// Audio player controls
+playBtn.addEventListener('click', togglePlayback);
+waveformCanvas.addEventListener('click', seekAudio);
+audioPlayer.addEventListener('timeupdate', updateAudioTime);
+audioPlayer.addEventListener('ended', () => {
+    playBtn.querySelector('.play-icon').style.display = 'block';
+    playBtn.querySelector('.pause-icon').style.display = 'none';
+    waveformProgress.style.width = '0%';
+});
+// Redraw waveform on resize
+window.addEventListener('resize', drawWaveform);
+transcribeBtn.addEventListener('click', transcribe);
+copyBtn.addEventListener('click', copyToClipboard);
+downloadBtn.addEventListener('click', downloadTranscript);
+clearBtn.addEventListener('click', clearAudio);
+// Drag and drop on input card
+inputCard.addEventListener('dragover', (e) => {
+    e.preventDefault();
+    inputCard.classList.add('drag-over');
+});
+inputCard.addEventListener('dragleave', (e) => {
+    e.preventDefault();
+    inputCard.classList.remove('drag-over');
+});
+inputCard.addEventListener('drop', handleFileDrop);
+// Initialize on load
+window.addEventListener('load', initModels);

index.html CHANGED Viewed

@@ -1,19 +1,135 @@
-<!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
 </html>

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Granite Speech WebGPU</title>
+    <link rel="stylesheet" href="style.css?v=3">
+</head>
+<body>
+    <div class="container">
+        <!-- Header -->
+        <h1>Granite Speech WebGPU</h1>
+        <h2>Speech recognition and translation directly in your browser</h2>
+        <!-- Audio Input Card -->
+        <div class="input-card">
+            <div class="input-options">
+                <button id="recordBtn" class="input-tile" disabled>
+                    <svg class="mic-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
+                        <path stroke-linecap="round" stroke-linejoin="round" d="M12 18.75a6 6 0 006-6v-1.5m-6 7.5a6 6 0 01-6-6v-1.5m6 7.5v3.75m-3.75 0h7.5M12 15.75a3 3 0 01-3-3V4.5a3 3 0 116 0v8.25a3 3 0 01-3 3z" />
+                    </svg>
+                    <svg class="stop-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" style="display: none;">
+                        <rect x="6" y="6" width="12" height="12" rx="1" />
+                    </svg>
+                    <span>Record</span>
+                </button>
+                <div class="divider"></div>
+                <label class="input-tile file-label">
+                    <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
+                        <path stroke-linecap="round" stroke-linejoin="round" d="M3.75 9.776c.112-.017.227-.026.344-.026h15.812c.117 0 .232.009.344.026m-16.5 0a2.25 2.25 0 00-1.883 2.542l.857 6a2.25 2.25 0 002.227 1.932H19.05a2.25 2.25 0 002.227-1.932l.857-6a2.25 2.25 0 00-1.883-2.542m-16.5 0V6A2.25 2.25 0 016 3.75h3.879a1.5 1.5 0 011.06.44l2.122 2.12a1.5 1.5 0 001.06.44H18A2.25 2.25 0 0120.25 9v.776" />
+                    </svg>
+                    <span>Upload</span>
+                    <input type="file" id="audioFile" accept="audio/*" hidden>
+                </label>
+            </div>
+            <div class="progress-bar">
+                <div class="progress-fill" id="progressFill"></div>
+            </div>
+        </div>
+        <!-- Status: live region so loading/processing/error changes are announced by screen readers -->
+        <div class="status-section" id="statusSection" role="status" aria-live="polite">
+            <span class="status-dot" id="statusDot" aria-hidden="true"></span>
+            <span id="statusText">Loading...</span>
+        </div>
+        <!-- Audio Player (hidden initially) -->
+        <div class="audio-player" id="audioPreview" style="display: none;">
+            <!-- Icon-only control: aria-label supplies the accessible name the SVGs do not provide -->
+            <button class="play-btn" id="playBtn" type="button" aria-label="Play or pause audio">
+                <svg class="play-icon" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor">
+                    <path d="M8 5v14l11-7z"/>
+                </svg>
+                <svg class="pause-icon" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor" style="display: none;">
+                    <path d="M6 19h4V5H6v14zm8-14v14h4V5h-4z"/>
+                </svg>
+            </button>
+            <div class="waveform-container">
+                <canvas id="waveformCanvas"></canvas>
+                <div class="waveform-progress" id="waveformProgress"></div>
+            </div>
+            <span class="audio-time" id="audioTime">0:00</span>
+            <button class="clear-btn" id="clearBtn" title="Clear">
+                <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
+                    <path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
+                </svg>
+            </button>
+            <audio id="audioPlayer" style="display: none;"></audio>
+        </div>
+        <!-- Transcribe Section -->
+        <div class="transcribe-section" id="transcribeSection" style="display: none;">
+            <div class="task-row">
+                <label for="promptSelect">Task:</label>
+                <select id="promptSelect">
+                    <option value="transcribe">Transcribe</option>
+                    <option value="translate_en">Translate to English</option>
+                    <option value="translate_fr">Translate to French</option>
+                    <option value="translate_de">Translate to German</option>
+                    <option value="translate_es">Translate to Spanish</option>
+                    <option value="translate_pt">Translate to Portuguese</option>
+                    <option value="translate_ja">Translate to Japanese</option>
+                </select>
+            </div>
+            <label class="checkbox-row">
+                <input type="checkbox" id="punctuationCheckbox" checked>
+                <span>Add punctuation (English only)</span>
+            </label>
+            <button id="transcribeBtn" class="transcribe-btn" disabled>
+                Transcribe
+            </button>
+        </div>
+        <!-- Model Loading Progress -->
+        <div class="model-progress" id="progressSection" style="display: none;">
+            <span id="progressText">Loading model...</span>
+        </div>
+        <!-- Transcript Output -->
+        <div class="transcript-card" id="transcriptCard" style="display: none;">
+            <div class="transcript-header">
+                <span>Transcript</span>
+                <div class="transcript-actions">
+                    <button id="copyBtn" class="icon-btn" title="Copy">
+                        <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
+                            <path stroke-linecap="round" stroke-linejoin="round" d="M15.666 3.888A2.25 2.25 0 0013.5 2.25h-3c-1.03 0-1.9.693-2.166 1.638m7.332 0c.055.194.084.4.084.612v0a.75.75 0 01-.75.75H9.75a.75.75 0 01-.75-.75v0c0-.212.03-.418.084-.612m7.332 0c.646.049 1.288.11 1.927.184 1.1.128 1.907 1.077 1.907 2.185V19.5a2.25 2.25 0 01-2.25 2.25H6.75A2.25 2.25 0 014.5 19.5V6.257c0-1.108.806-2.057 1.907-2.185a48.208 48.208 0 011.927-.184" />
+                        </svg>
+                    </button>
+                    <button id="downloadBtn" class="icon-btn" title="Download">
+                        <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
+                            <path stroke-linecap="round" stroke-linejoin="round" d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5M16.5 12L12 16.5m0 0L7.5 12m4.5 4.5V3" />
+                        </svg>
+                    </button>
+                </div>
+            </div>
+            <div class="transcript-output" id="outputText"></div>
+        </div>
+        <!-- Footer -->
+        <div class="footer">
+            Made with
+            <a href="https://huggingface.co/ibm-granite/granite-4.0-1b-speech" target="_blank">Granite Speech 4.0 1B</a>
+            and
+            <a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank">ONNX Runtime Web</a>
+            <br>
+            <span class="privacy-note">Your audio and transcription never leave your device</span>
+        </div>
+        <div class="gpu-info" id="gpuInfo"></div>
+    </div>
+    <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.all.min.js"></script>
+    <script src="vad.js?v=1"></script>
+    <script src="punctuator.js?v=3"></script>
+    <script type="module" src="app.js?v=53"></script>
+</body>
 </html>

pcs_vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

punct_cap_seg_en.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd922d459da618cd324280889740608b76fb3e9e61d3f402291be1251f91421b
+size 209532928

punctuator.js ADDED Viewed

	@@ -0,0 +1,294 @@

+/**
+ * Punctuation and Capitalization using ONNX
+ * - English: Full punctuation + capitalization (1-800-BAD-CODE model)
+ * - Other languages (DE, FR, IT, NL, ES, PT): Punctuation only (oliverguhr multilingual model)
+ */
+// English model (punctuation + capitalization) state; all three stay null until loadEnglishPunctuator() has run
+let pcsSession = null; // ONNX Runtime inference session
+let pcsVocab = null; // piece -> token id, from pcs_vocab.json
+let pcsVocabReverse = null; // token id -> piece, derived from pcsVocab
+// Multilingual model (punctuation only) state; both stay null until loadMultilingualPunctuator() has run
+let multilingualSession = null; // ONNX Runtime inference session
+let multilingualTokenizer = null; // tokenizer obtained from AutoTokenizer.from_pretrained
+const PCS_CONFIG = {
+    preLabels: ["<NULL>", "¿"], // index 1 is emitted before a word's first character
+    postLabels: ["<NULL>", "<ACRONYM>", ".", ",", "?"], // indexed by the model's post-punctuation prediction
+    unkId: 0, // pushed for characters with no matching vocab piece
+    bosId: 1, // first id of every tokenized sequence
+    eosId: 2, // last id of every tokenized sequence
+    padId: 3, // not referenced in this file
+};
+// Multilingual model: predicted class index -> punctuation appended after the token
+const MULTILINGUAL_LABELS = {
+    0: "",      // No punctuation
+    1: ".",     // Period
+    2: ",",     // Comma
+    3: "?",     // Question mark
+    4: "-",     // Hyphen
+    5: ":",     // Colon
+};
+// Language codes that applyPunctuation() routes to the multilingual model
+const MULTILINGUAL_LANGS = ['de', 'fr', 'it', 'nl', 'es', 'pt'];
+// Load the English punctuator model and vocab
+async function loadEnglishPunctuator() {
+    if (pcsSession) return;
+    console.log('Loading English punctuator model...');
+    // Load vocab
+    const vocabResponse = await fetch('./pcs_vocab.json');
+    const vocabData = await vocabResponse.json();
+    pcsVocab = vocabData.vocab;
+    // Create reverse vocab (id -> piece)
+    pcsVocabReverse = {};
+    for (const [piece, id] of Object.entries(pcsVocab)) {
+        pcsVocabReverse[id] = piece;
+    }
+    // Load ONNX model
+    pcsSession = await ort.InferenceSession.create('./punct_cap_seg_en.onnx', {
+        executionProviders: ['wasm'],
+    });
+    console.log('English punctuator model loaded');
+}
+// Load the multilingual punctuator model
+async function loadMultilingualPunctuator() {
+    if (multilingualSession) return;
+    console.log('Loading multilingual punctuator model...');
+    // Load tokenizer from transformers.js
+    const { AutoTokenizer } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2');
+    multilingualTokenizer = await AutoTokenizer.from_pretrained('oliverguhr/fullstop-punctuation-multilingual-base');
+    // Load ONNX model
+    multilingualSession = await ort.InferenceSession.create('./punct_multilingual_q8.onnx', {
+        executionProviders: ['wasm'],
+    });
+    console.log('Multilingual punctuator model loaded');
+}
+// Simple Unigram tokenizer for English model (greedy longest match)
+function tokenizeEnglish(text) {
+    const normalized = text.toLowerCase().replace(/ /g, '▁');
+    const tokens = [];
+    let i = 0;
+    // Add BOS
+    tokens.push(PCS_CONFIG.bosId);
+    // Prepend ▁ for first word
+    let remaining = '▁' + normalized;
+    while (remaining.length > 0) {
+        let found = false;
+        // Try longest match first
+        for (let len = Math.min(remaining.length, 20); len > 0; len--) {
+            const piece = remaining.substring(0, len);
+            if (pcsVocab[piece] !== undefined) {
+                tokens.push(pcsVocab[piece]);
+                remaining = remaining.substring(len);
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            // Unknown character, use UNK and skip
+            tokens.push(PCS_CONFIG.unkId);
+            remaining = remaining.substring(1);
+        }
+    }
+    // Add EOS
+    tokens.push(PCS_CONFIG.eosId);
+    return tokens;
+}
+// Apply punctuation and capitalization for English
+async function applyEnglishPunctuation(text) {
+    await loadEnglishPunctuator();
+    // Tokenize
+    const tokenIds = tokenizeEnglish(text);
+    // Run inference
+    const inputTensor = new ort.Tensor('int64', BigInt64Array.from(tokenIds.map(BigInt)), [1, tokenIds.length]);
+    const outputs = await pcsSession.run({ input_ids: inputTensor });
+    const prePreds = outputs.pre_preds.data;
+    const postPreds = outputs.post_preds.data;
+    const capPreds = outputs.cap_preds.data;
+    const segPreds = outputs.seg_preds.data;
+    // Decode: skip BOS (index 0) and EOS (last index)
+    const numTokens = tokenIds.length - 2;
+    const result = [];
+    let currentSentence = [];
+    for (let i = 0; i < numTokens; i++) {
+        const tokenId = tokenIds[i + 1];
+        const token = pcsVocabReverse[tokenId] || '';
+        const outputIdx = i + 1;
+        // Handle word boundary
+        if (token.startsWith('▁') && currentSentence.length > 0) {
+            currentSentence.push(' ');
+        }
+        // Process each character in token
+        const charStart = token.startsWith('▁') ? 1 : 0;
+        for (let j = charStart; j < token.length; j++) {
+            let char = token[j];
+            // Pre-punctuation (e.g., inverted question mark)
+            if (j === charStart && prePreds[outputIdx] === 1) {
+                currentSentence.push(PCS_CONFIG.preLabels[1]);
+            }
+            // Capitalization - capPreds is [batch, seq, 16]
+            const capOffset = outputIdx * 16 + j;
+            if (capPreds[capOffset]) {
+                char = char.toUpperCase();
+            }
+            currentSentence.push(char);
+            // Post-punctuation
+            const postLabel = postPreds[outputIdx];
+            if (postLabel === 1) {  // ACRONYM
+                currentSentence.push('.');
+            } else if (j === token.length - 1 && postLabel > 1) {
+                currentSentence.push(PCS_CONFIG.postLabels[postLabel]);
+            }
+        }
+        // Sentence boundary
+        if (segPreds[outputIdx]) {
+            result.push(currentSentence.join(''));
+            currentSentence = [];
+        }
+    }
+    if (currentSentence.length > 0) {
+        result.push(currentSentence.join(''));
+    }
+    return result.join(' ');
+}
+// Apply punctuation only for other languages (multilingual model)
+async function applyMultilingualPunctuation(text) {
+    await loadMultilingualPunctuator();
+    // Tokenize using transformers.js tokenizer
+    const encoded = await multilingualTokenizer(text, {
+        return_tensors: false,
+        padding: false,
+        truncation: true,
+        max_length: 512,
+    });
+    const inputIds = encoded.input_ids;
+    const attentionMask = encoded.attention_mask;
+    // Run inference
+    const inputIdsTensor = new ort.Tensor('int64', BigInt64Array.from(inputIds.map(BigInt)), [1, inputIds.length]);
+    const attentionMaskTensor = new ort.Tensor('int64', BigInt64Array.from(attentionMask.map(BigInt)), [1, attentionMask.length]);
+    const outputs = await multilingualSession.run({
+        input_ids: inputIdsTensor,
+        attention_mask: attentionMaskTensor,
+    });
+    const logits = outputs.logits.data;
+    const numLabels = 6;
+    // Get predictions (argmax over logits)
+    const predictions = [];
+    for (let i = 0; i < inputIds.length; i++) {
+        let maxIdx = 0;
+        let maxVal = logits[i * numLabels];
+        for (let j = 1; j < numLabels; j++) {
+            if (logits[i * numLabels + j] > maxVal) {
+                maxVal = logits[i * numLabels + j];
+                maxIdx = j;
+            }
+        }
+        predictions.push(maxIdx);
+    }
+    // Decode tokens back to text with punctuation
+    const tokens = multilingualTokenizer.model.convert_ids_to_tokens(inputIds);
+    const result = [];
+    for (let i = 0; i < tokens.length; i++) {
+        const token = tokens[i];
+        // Skip special tokens
+        if (token === '<s>' || token === '</s>' || token === '<pad>') {
+            continue;
+        }
+        // Handle subword tokens (▁ prefix indicates start of new word)
+        if (token.startsWith('▁')) {
+            if (result.length > 0) {
+                result.push(' ');
+            }
+            result.push(token.substring(1));
+        } else {
+            result.push(token);
+        }
+        // Add punctuation after token
+        const punct = MULTILINGUAL_LABELS[predictions[i]];
+        if (punct) {
+            result.push(punct);
+        }
+    }
+    return result.join('');
+}
+// Main entry point - routes to appropriate model based on language
+async function applyPunctuation(text, lang = null) {
+    if (!text || text.trim().length === 0) return text;
+    // If language specified and supported by multilingual model, use it
+    if (lang && MULTILINGUAL_LANGS.includes(lang)) {
+        try {
+            return await applyMultilingualPunctuation(text);
+        } catch (error) {
+            console.warn('Multilingual punctuation failed, returning original:', error);
+            return text;
+        }
+    }
+    // Default to English model
+    try {
+        return await applyEnglishPunctuation(text);
+    } catch (error) {
+        console.warn('English punctuation failed, returning original:', error);
+        return text;
+    }
+}
+// Preload English model (called during init)
+async function loadPunctuator() {
+    await loadEnglishPunctuator();
+}
+// Export for use in app.js
+window.applyPunctuation = applyPunctuation;
+window.loadPunctuator = loadPunctuator;
+window.MULTILINGUAL_PUNCT_LANGS = MULTILINGUAL_LANGS;

silero_vad.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
+size 2243022

style.css CHANGED Viewed

@@ -1,28 +1,638 @@
 body {
-	padding: 2rem;
-	font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
 }
 h1 {
-	font-size: 16px;
-	margin-top: 0;
 }
-p {
-	color: rgb(107, 114, 128);
-	font-size: 15px;
-	margin-bottom: 10px;
-	margin-top: 5px;
 }
-.card {
-	max-width: 620px;
-	margin: 0 auto;
-	padding: 16px;
-	border: 1px solid lightgray;
-	border-radius: 16px;
 }
-.card p:last-child {
-	margin-bottom: 0;
 }

+/* Granite Speech WebGPU - Whisper Web Style */
+* {
+    box-sizing: border-box;
+    margin: 0;
+    padding: 0;
+}
 body {
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
+    background: #f8fafc;
+    color: #0f172a;
+    min-height: 100vh;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    padding: 2rem;
+    line-height: 1.5;
+}
+.container {
+    width: 100%;
+    max-width: 540px;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    gap: 1.5rem;
 }
+/* Header */
 h1 {
+    font-size: 3rem;
+    font-weight: 800;
+    letter-spacing: -0.025em;
+    color: #0f172a;
+    text-align: center;
+}
+h2 {
+    font-size: 1.25rem;
+    font-weight: 600;
+    letter-spacing: -0.015em;
+    color: #0f172a;
+    text-align: center;
+    padding: 0 1rem;
+    white-space: nowrap;
+}
+.privacy-note {
+    font-size: 0.8125rem;
+    color: #64748b;
+    text-align: center;
+}
+/* Input Card */
+.input-card {
+    width: 100%;
+    background: white;
+    border-radius: 0.5rem;
+    box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.05);
+    border: 1px solid rgba(51, 65, 85, 0.1);
+    overflow: hidden;
+}
+.input-options {
+    display: flex;
+    align-items: stretch;
+    padding: 0.5rem;
+    gap: 0.5rem;
+}
+.input-tile {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    gap: 0.5rem;
+    padding: 0.5rem 0.75rem;
+    background: transparent;
+    border: none;
+    border-radius: 0.5rem;
+    color: #64748b;
+    font-size: 0.9375rem;
+    cursor: pointer;
+    transition: all 0.2s;
+    flex: 1;
+}
+.input-tile:hover:not(:disabled) {
+    color: #4f46e5;
+    background: #eef2ff;
+}
+.input-tile.recording {
+    color: #ef4444;
+    background: #fef2f2;
+}
+.input-tile.recording:hover {
+    color: #dc2626;
+    background: #fee2e2;
+}
+.input-tile:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+}
+.input-tile svg {
+    width: 1.75rem;
+    height: 1.75rem;
+    flex-shrink: 0;
+}
+.file-label {
+    cursor: pointer;
+    transition: all 0.2s;
+}
+.file-label.drag-over {
+    color: #2563eb;
+    background: #eff6ff;
+}
+.input-card.drag-over {
+    border: 2px dashed #2563eb;
+    background: #eff6ff;
+}
+.divider {
+    width: 1px;
+    background: #e2e8f0;
+    margin: 0.25rem 0;
+}
+/* Progress Bar */
+.progress-bar {
+    width: 100%;
+    height: 4px;
+    background: #e5e7eb;
+}
+.progress-fill {
+    height: 100%;
+    background: #2563eb;
+    width: 0%;
+    transition: width 0.1s;
+    border-radius: 0 2px 2px 0;
+}
+/* Status Section */
+.status-section {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    font-size: 0.875rem;
+    color: #64748b;
+}
+.status-dot {
+    width: 8px;
+    height: 8px;
+    border-radius: 50%;
+    background: #94a3b8;
+}
+.status-dot.ready {
+    background: #22c55e;
+}
+.status-dot.loading,
+.status-dot.processing {
+    background: #f59e0b;
+    animation: pulse 1.5s infinite;
+}
+.status-dot.recording {
+    background: #ef4444;
+    animation: pulse 0.8s infinite;
+}
+.status-dot.error {
+    background: #ef4444;
+}
+@keyframes pulse {
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+}
+/* Audio Player */
+.audio-player {
+    width: 100%;
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    padding: 0.25rem 0.5rem;
+    background: white;
+    border-radius: 0.5rem;
+    border: 1px solid rgba(51, 65, 85, 0.1);
+}
+.play-btn {
+    width: 24px;
+    height: 24px;
+    border-radius: 50%;
+    border: none;
+    background: #2563eb;
+    color: white;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-shrink: 0;
+    transition: background 0.2s;
+}
+.play-btn:hover {
+    background: #1d4ed8;
+}
+.play-btn svg {
+    width: 14px;
+    height: 14px;
+}
+.waveform-container {
+    flex: 1;
+    height: 24px;
+    position: relative;
+    cursor: pointer;
+    border-radius: 4px;
+    overflow: hidden;
+}
+#waveformCanvas {
+    width: 100%;
+    height: 100%;
+    display: block;
+}
+.waveform-progress {
+    position: absolute;
+    top: 0;
+    left: 0;
+    height: 100%;
+    width: 0%;
+    background: rgba(37, 99, 235, 0.3);
+    pointer-events: none;
+}
+.audio-time {
+    font-size: 0.8125rem;
+    color: #64748b;
+    font-variant-numeric: tabular-nums;
+    min-width: 3rem;
+    text-align: right;
+}
+.clear-btn {
+    width: 20px;
+    height: 20px;
+    border-radius: 50%;
+    border: none;
+    background: transparent;
+    color: #94a3b8;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    flex-shrink: 0;
+    padding: 0;
+    transition: all 0.2s;
+}
+.clear-btn:hover {
+    color: #ef4444;
+    background: #fef2f2;
+}
+.clear-btn svg {
+    width: 14px;
+    height: 14px;
+}
+/* Transcribe Section */
+.transcribe-section {
+    width: 100%;
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    gap: 1rem;
+}
+.task-row {
+    display: flex;
+    align-items: center;
+    gap: 0.75rem;
+    font-size: 0.875rem;
+    color: #64748b;
+}
+.task-row select {
+    padding: 0.375rem 0.75rem;
+    font-size: 0.875rem;
+    background: white;
+    color: #0f172a;
+    border: 1px solid #d1d5db;
+    border-radius: 0.5rem;
+    cursor: pointer;
+    outline: none;
 }
+.task-row select:focus {
+    border-color: #2563eb;
+    box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2);
 }
+/* Primary action button: solid blue with white text; only the background is animated */
+.transcribe-btn {
+    padding: 0.625rem 2.5rem;
+    font-size: 0.9375rem;
+    font-weight: 500;
+    background: #2563eb;
+    color: white;
+    border: none;
+    border-radius: 0.5rem;
+    cursor: pointer;
+    transition: background 0.2s;
 }
+/* Darker blue on hover, but only while the button is enabled */
+.transcribe-btn:hover:not(:disabled) {
+    background: #1d4ed8;
+}
+/* Disabled state: half opacity and a "not allowed" cursor */
+.transcribe-btn:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+}
+/* Checkbox with its label text on one line; the whole row shows a pointer cursor */
+.checkbox-row {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    font-size: 0.875rem;
+    color: #64748b;
+    cursor: pointer;
+}
+/* 1rem checkbox tinted with the same blue as the primary button */
+.checkbox-row input[type="checkbox"] {
+    width: 1rem;
+    height: 1rem;
+    cursor: pointer;
+    accent-color: #2563eb;
+}
+/* Model Progress */
+/* Small muted text; presumably the model download/load status line (TODO confirm against index.html) */
+.model-progress {
+    font-size: 0.875rem;
+    color: #64748b;
+}
+/* Transcript Card */
+/* Full-width white card with a faint border and a large, very soft drop shadow */
+.transcript-card {
+    width: 100%;
+    background: white;
+    border-radius: 0.5rem;
+    box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.05);
+    border: 1px solid rgba(51, 65, 85, 0.1);
+    /* Clips child content to the card's rounded corners */
+    overflow: hidden;
+}
+/* Card header bar: children pushed to opposite ends, with a divider line underneath */
+.transcript-header {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    padding: 0.75rem 1rem;
+    border-bottom: 1px solid #e2e8f0;
+    font-size: 0.875rem;
+    font-weight: 500;
+    color: #64748b;
+}
+/* Tight horizontal group for the header's action buttons */
+.transcript-actions {
+    display: flex;
+    gap: 0.25rem;
+}
+/* Square 32px icon button: transparent and borderless, with its content centered */
+.icon-btn {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    width: 32px;
+    height: 32px;
+    background: transparent;
+    border: none;
+    border-radius: 0.375rem;
+    color: #64748b;
+    cursor: pointer;
+    transition: all 0.2s;
+}
+/* Hover: light grey background and a darker glyph */
+.icon-btn:hover {
+    background: #f1f5f9;
+    color: #0f172a;
+}
+/* Icon is 18px square inside the 32px button */
+.icon-btn svg {
+    width: 18px;
+    height: 18px;
+}
+/* Transcript body: between 100px and 300px tall, scrolling vertically once content exceeds the maximum */
+.transcript-output {
+    padding: 1rem;
+    min-height: 100px;
+    max-height: 300px;
+    overflow-y: auto;
+    font-size: 0.9375rem;
+    line-height: 1.7;
+    color: #0f172a;
+    /* Break over-long words instead of overflowing (word-wrap is the legacy name of overflow-wrap) */
+    word-wrap: break-word;
+}
+/* One transcript line: a timestamp column followed by the text */
+.transcript-row {
+    display: flex;
+    gap: 0.75rem;
+    padding: 0.25rem 0;
+}
+/* Timestamp column: muted, right-aligned, never shrinks, fixed-width digits so rows line up */
+.transcript-row .timestamp {
+    font-size: 0.9375rem;
+    color: #94a3b8;
+    flex-shrink: 0;
+    min-width: 3rem;
+    text-align: right;
+    font-variant-numeric: tabular-nums;
+}
+/* Text column takes all remaining width in the row */
+.transcript-row .transcript-text {
+    flex: 1;
+}
+/* Footer */
+/* Centered, small, muted text */
+.footer {
+    text-align: center;
+    font-size: 0.875rem;
+    color: #64748b;
+}
+/* Footer links are blue and only underlined while hovered */
+.footer a {
+    color: #2563eb;
+    text-decoration: none;
+}
+.footer a:hover {
+    text-decoration: underline;
+}
+/* Extra-small monospace line; presumably shows the detected GPU/adapter (TODO confirm in app.js) */
+.gpu-info {
+    font-size: 0.75rem;
+    font-family: 'SF Mono', Monaco, 'Courier New', monospace;
+    color: #94a3b8;
+}
+/* Responsive */
+/* Viewports up to 640px wide: set page padding and heading sizes, and compact the input tiles.
+   The base rules these override are defined earlier in the stylesheet. */
+@media (max-width: 640px) {
+    body {
+        padding: 1rem;
+    }
+    h1 {
+        font-size: 2.25rem;
+    }
+    h2 {
+        font-size: 1rem;
+    }
+    /* Input tiles get less padding and smaller text on narrow screens */
+    .input-tile {
+        padding: 0.5rem;
+        font-size: 0.8125rem;
+    }
+    .input-tile svg {
+        width: 1.5rem;
+        height: 1.5rem;
+    }
+}
+/* Scrollbar */
+/* Custom scrollbar for the transcript body. These are vendor-prefixed ::-webkit-scrollbar
+   pseudo-elements, so they only apply in engines that implement them. */
+.transcript-output::-webkit-scrollbar {
+    width: 6px;
+}
+.transcript-output::-webkit-scrollbar-track {
+    background: #f1f5f9;
+}
+/* Thumb radius is half the 6px bar width, giving fully rounded ends */
+.transcript-output::-webkit-scrollbar-thumb {
+    background: #cbd5e1;
+    border-radius: 3px;
+}
+.transcript-output::-webkit-scrollbar-thumb:hover {
+    background: #94a3b8;
+}
+/* Dark Mode */
+/* Applied when the user agent reports a dark colour-scheme preference. Only colours,
+   backgrounds, borders and shadows are overridden here; layout comes from the base rules.
+   Several selectors (.input-card, .audio-player, .input-tile, .divider, .progress-bar,
+   .status-section, .privacy-note) have their base rules outside this part of the file. */
+@media (prefers-color-scheme: dark) {
+    /* Page: dark slate background with light text */
+    body {
+        background: #0f172a;
+        color: #e2e8f0;
+    }
+    h1, h2 {
+        color: #f1f5f9;
+    }
+    /* Card-like surfaces: one step lighter than the page, with a stronger shadow */
+    .input-card,
+    .transcript-card,
+    .audio-player {
+        background: #1e293b;
+        border-color: rgba(148, 163, 184, 0.1);
+        box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.3);
+    }
+    .audio-time {
+        color: #94a3b8;
+    }
+    /* Input card while it carries the drag-over class */
+    .input-card.drag-over {
+        background: #1e3a5f;
+        border-color: #3b82f6;
+    }
+    /* Input tiles: muted by default, indigo on hover (unless disabled), red while recording */
+    .input-tile {
+        color: #94a3b8;
+    }
+    .input-tile:hover:not(:disabled) {
+        color: #818cf8;
+        background: #312e81;
+    }
+    .input-tile.recording {
+        color: #f87171;
+        background: #450a0a;
+    }
+    .input-tile.recording:hover {
+        color: #fca5a5;
+        background: #7f1d1d;
+    }
+    .divider {
+        background: #334155;
+    }
+    .progress-bar {
+        background: #334155;
+    }
+    /* Secondary text blocks share one muted colour */
+    .status-section,
+    .task-row,
+    .checkbox-row,
+    .model-progress {
+        color: #94a3b8;
+    }
+    /* Task dropdown: dark field, light text, and a slightly stronger focus ring than in light mode */
+    .task-row select {
+        background: #1e293b;
+        color: #e2e8f0;
+        border-color: #475569;
+    }
+    .task-row select:focus {
+        border-color: #3b82f6;
+        box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.3);
+    }
+    /* Transcript card internals */
+    .transcript-header {
+        border-color: #334155;
+        color: #94a3b8;
+    }
+    .transcript-output {
+        color: #e2e8f0;
+    }
+    .transcript-row .timestamp {
+        color: #64748b;
+    }
+    /* Icon and clear buttons */
+    .icon-btn {
+        color: #94a3b8;
+    }
+    .icon-btn:hover {
+        background: #334155;
+        color: #f1f5f9;
+    }
+    .clear-btn {
+        color: #64748b;
+    }
+    .clear-btn:hover {
+        color: #f87171;
+        background: #450a0a;
+    }
+    /* Footer and small print */
+    .footer {
+        color: #64748b;
+    }
+    .footer a {
+        color: #60a5fa;
+    }
+    .privacy-note {
+        color: #64748b;
+    }
+    .gpu-info {
+        color: #64748b;
+    }
+    /* Scrollbar dark */
+    .transcript-output::-webkit-scrollbar-track {
+        background: #1e293b;
+    }
+    .transcript-output::-webkit-scrollbar-thumb {
+        background: #475569;
+    }
+    .transcript-output::-webkit-scrollbar-thumb:hover {
+        background: #64748b;
+    }
 }

vad.js ADDED Viewed

	@@ -0,0 +1,157 @@

+/**
+ * Silero VAD for speech detection and silence trimming
+ * Based on the approach in ibm-granite/granite-speech HF demo
+ */
+// Cached ONNX Runtime inference session for the VAD model; null until loadVAD() has created it
+let vadSession = null;
+// Sample rate in Hz that is passed to the model as its `sr` input and used as the
+// default when converting between sample indices and seconds
+const VAD_SAMPLE_RATE = 16000;
+// Number of samples fed to the model per inference call
+const VAD_CHUNK_SIZE = 512;  // 32ms chunks at 16kHz
+// Load VAD model
+async function loadVAD() {
+    if (vadSession) return;
+    console.log('Loading VAD model...');
+    vadSession = await ort.InferenceSession.create('./silero_vad.onnx', {
+        executionProviders: ['wasm'],
+    });
+    console.log('VAD model loaded');
+}
+// Get speech timestamps using Silero VAD
+// Returns list of {start, end} in samples
+async function getSpeechTimestamps(audioData, threshold = 0.5) {
+    await loadVAD();
+    // Initialize state [2, 1, 128]
+    let state = new Float32Array(2 * 1 * 128);
+    const sr = BigInt(VAD_SAMPLE_RATE);
+    const speechProbs = [];
+    // Process in chunks
+    for (let i = 0; i < audioData.length; i += VAD_CHUNK_SIZE) {
+        const chunkEnd = Math.min(i + VAD_CHUNK_SIZE, audioData.length);
+        let chunk = new Float32Array(VAD_CHUNK_SIZE);
+        // Copy chunk data
+        for (let j = 0; j < chunkEnd - i; j++) {
+            chunk[j] = audioData[i + j];
+        }
+        // Run VAD
+        const inputTensor = new ort.Tensor('float32', chunk, [1, VAD_CHUNK_SIZE]);
+        const stateTensor = new ort.Tensor('float32', state, [2, 1, 128]);
+        const srTensor = new ort.Tensor('int64', BigInt64Array.from([sr]), []);
+        const outputs = await vadSession.run({
+            input: inputTensor,
+            state: stateTensor,
+            sr: srTensor
+        });
+        speechProbs.push(outputs.output.data[0]);
+        state = new Float32Array(outputs.stateN.data);
+    }
+    // Find speech segments
+    const segments = [];
+    let inSpeech = false;
+    let speechStart = 0;
+    for (let i = 0; i < speechProbs.length; i++) {
+        const isSpeech = speechProbs[i] >= threshold;
+        if (isSpeech && !inSpeech) {
+            speechStart = i * VAD_CHUNK_SIZE;
+            inSpeech = true;
+        } else if (!isSpeech && inSpeech) {
+            segments.push({
+                start: speechStart,
+                end: i * VAD_CHUNK_SIZE
+            });
+            inSpeech = false;
+        }
+    }
+    if (inSpeech) {
+        segments.push({
+            start: speechStart,
+            end: audioData.length
+        });
+    }
+    return segments;
+}
+// Get speech segments with merging (like granite-speech demo)
+// Returns segments with start/end in seconds
+async function getSpeechSegments(audioData, sampleRate = VAD_SAMPLE_RATE) {
+    const vadSegments = await getSpeechTimestamps(audioData);
+    if (vadSegments.length === 0) {
+        return [{ start: 0, end: audioData.length / sampleRate }];
+    }
+    // Convert to seconds and apply buffering/merging
+    const startBuffer = 0.3;  // seconds - pad segment start
+    const minGap = 0.5;       // seconds - merge segments with small gaps
+    const segments = [];
+    for (const seg of vadSegments) {
+        const startSec = seg.start / sampleRate;
+        const endSec = seg.end / sampleRate;
+        const bufferedStart = Math.max(0, startSec - startBuffer);
+        if (segments.length > 0 && bufferedStart - segments[segments.length - 1].end < minGap) {
+            // Merge with previous segment
+            segments[segments.length - 1].end = endSec;
+        } else {
+            // Start new segment
+            if (segments.length > 0) {
+                // Extend to previous end to avoid gaps
+                segments.push({ start: segments[segments.length - 1].end, end: endSec });
+            } else {
+                segments.push({ start: bufferedStart, end: endSec });
+            }
+        }
+    }
+    return segments;
+}
+// Trim silence from audio (simple version - just trim start/end)
+async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE) {
+    const segments = await getSpeechTimestamps(audioData);
+    if (segments.length === 0) {
+        console.log('VAD: No speech detected, returning original audio');
+        return audioData;
+    }
+    // Add padding (300ms)
+    const paddingSamples = Math.floor(0.3 * sampleRate);
+    const start = Math.max(0, segments[0].start - paddingSamples);
+    const end = Math.min(audioData.length, segments[segments.length - 1].end + paddingSamples);
+    const trimmedStart = (start / sampleRate).toFixed(2);
+    const trimmedEnd = ((audioData.length - end) / sampleRate).toFixed(2);
+    console.log(`VAD: Trimmed ${trimmedStart}s from start, ${trimmedEnd}s from end`);
+    return audioData.slice(start, end);
+}
+// Format timestamp as MM:SS
+function formatTimestamp(seconds) {
+    const mins = Math.floor(seconds / 60);
+    const secs = Math.floor(seconds % 60);
+    return `${mins}:${secs.toString().padStart(2, '0')}`;
+}
+// Export
+window.loadVAD = loadVAD;
+window.trimSilence = trimSilence;
+window.getSpeechSegments = getSpeechSegments;
+window.formatTimestamp = formatTimestamp;