gsaon committed on
Commit
dbc1132
·
verified ·
1 Parent(s): b7e38a0

Upload folder using huggingface_hub

Browse files
Files changed (9) hide show
  1. README.md +3 -6
  2. app.js +1095 -0
  3. index.html +134 -18
  4. pcs_vocab.json +0 -0
  5. punct_cap_seg_en.onnx +3 -0
  6. punctuator.js +294 -0
  7. silero_vad.onnx +3 -0
  8. style.css +627 -17
  9. vad.js +157 -0
README.md CHANGED
@@ -1,11 +1,8 @@
1
  ---
2
- title: Granite Speech Webgpu
3
- emoji: 🚀
4
- colorFrom: pink
5
  colorTo: indigo
6
  sdk: static
7
  pinned: false
8
- license: apache-2.0
9
  ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Granite Speech WebGPU
3
+ emoji: 🗣️
4
+ colorFrom: blue
5
  colorTo: indigo
6
  sdk: static
7
  pinned: false
 
8
  ---
 
 
app.js ADDED
@@ -0,0 +1,1095 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Granite Speech WebGPU Demo
3
+ * Uses ONNX Runtime Web for in-browser speech recognition
4
+ */
5
+
6
+ import { PreTrainedTokenizer } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2';
7
+ import { detect } from 'https://cdn.jsdelivr.net/npm/tinyld/+esm';
8
+
9
// Configure ONNX Runtime Web when the global `ort` object is present;
// otherwise tell the user that the runtime script did not load.
if (typeof ort !== 'undefined') {
  const runtimeEnv = ort.env;

  // Fetch the WASM binaries from the CDN.
  runtimeEnv.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/';

  // One WASM thread per reported core, falling back to 4 when the count is unknown.
  runtimeEnv.wasm.numThreads = navigator.hardwareConcurrency || 4;
  runtimeEnv.wasm.simd = true;

  // Make sure the WebGPU settings object exists.
  runtimeEnv.webgpu = runtimeEnv.webgpu || {};
} else {
  console.error('ONNX Runtime Web not loaded! Check if the script tag is correct.');
  alert('Failed to load ONNX Runtime. Please refresh the page.');
}
24
+
25
+ // Model paths
26
+ // Granite Speech ONNX models hosted on HF Hub
27
+ const HF_MODEL_BASE = 'https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main/onnx';
28
+ const ENCODER_PATH = `${HF_MODEL_BASE}/audio_encoder_q4f32.onnx`; // NOTE(review): file name suggests 4-bit weights with fp32 activations - confirm
29
+ const EMBED_PATH = `${HF_MODEL_BASE}/embed_tokens_q4f16.onnx`; // NOTE(review): file name suggests 4-bit weights with fp16 activations - confirm
30
+ const DECODER_PATH = `${HF_MODEL_BASE}/decoder_model_merged_q4f16.onnx`;
31
+
32
+ // Audio config from preprocessor_config.json
33
+ const SAMPLE_RATE = 16000; // Hz; decoded audio is resampled to this rate before feature extraction
34
+ const N_MELS = 80; // mel bins per spectrogram frame
35
+ const N_FFT = 512; // FFT size; the shorter analysis window is centered inside each FFT frame
36
+ const HOP_LENGTH = 160; // step between consecutive frames, in samples
37
+ const WIN_LENGTH = 400; // analysis window length in samples (shorter than N_FFT)
38
+
39
+ // Model config
40
+ const HIDDEN_SIZE = 2048; // per-token embedding width used when concatenating prompt and audio embeddings
41
+ const VOCAB_SIZE = 100353; // length of one logits row scanned by the greedy argmax
42
+ const BOS_TOKEN = 100257; // NOTE(review): same id as EOS_TOKEN and not referenced in the visible code - confirm
43
+ const EOS_TOKEN = 100257; // decoding stops when this id is selected
44
+ const PAD_TOKEN = 100256; // not referenced in the visible code
45
+ const MAX_NEW_TOKENS = 256; // upper bound on decode steps per audio segment
46
+ // Note: embedding_multiplier (12) is likely already applied in the model weights
47
+
48
+ // Prompt templates: the model input is PROMPT_PREFIX embeddings, then the audio embeddings, then one PROMPTS entry
49
+ const PROMPT_PREFIX = 'USER: ';
50
+ const PROMPTS = { // keys are looked up with promptSelect.value; 'transcribe' is the fallback entry
51
+ 'transcribe': 'Transcribe the speech to text\n ASSISTANT:',
52
+ 'translate_en': 'Translate the speech to English\n ASSISTANT:',
53
+ 'translate_fr': 'Translate the speech to French\n ASSISTANT:',
54
+ 'translate_de': 'Translate the speech to German\n ASSISTANT:',
55
+ 'translate_es': 'Translate the speech to Spanish\n ASSISTANT:',
56
+ 'translate_pt': 'Translate the speech to Portuguese\n ASSISTANT:',
57
+ 'translate_ja': 'Translate the speech to Japanese\n ASSISTANT:',
58
+ };
59
+
60
+ // State (all sessions and the tokenizer are assigned by initModels)
61
+ let encoderSession = null; // audio encoder inference session
62
+ let embedSession = null; // token-embedding inference session
63
+ let decoderSession = null; // decoder inference session
64
+ let tokenizer = null;
65
+ let isModelLoading = false; // set when initModels starts; reset only if loading fails
66
+ let currentAudioData = null; // mono samples of the current clip, set by processAudioBlob
67
+
68
+ // Pre-computed prompt embeddings (populated at init)
69
+ const promptEmbeddings = {
70
+ prefix: null, // "USER: "
71
+ // suffix embeddings keyed by prompt name
72
+ };
73
+
74
+ // DOM Elements (looked up once at module load; a missing element yields null here)
75
+ const statusDot = document.getElementById('statusDot');
76
+ const statusText = document.getElementById('statusText');
77
+ const recordBtn = document.getElementById('recordBtn');
78
+ const audioFile = document.getElementById('audioFile');
79
+ const fileTile = document.querySelector('.file-label');
80
+ const inputCard = document.querySelector('.input-card');
81
+ const audioPreview = document.getElementById('audioPreview');
82
+ const audioPlayer = document.getElementById('audioPlayer'); // media element that plays the recorded/uploaded clip
83
+ const playBtn = document.getElementById('playBtn');
84
+ const waveformCanvas = document.getElementById('waveformCanvas');
85
+ const waveformProgress = document.getElementById('waveformProgress');
86
+ const audioTime = document.getElementById('audioTime');
87
+ const transcribeSection = document.getElementById('transcribeSection');
88
+ const transcribeBtn = document.getElementById('transcribeBtn');
89
+ const promptSelect = document.getElementById('promptSelect'); // its value selects the entry of PROMPTS to use
90
+ const punctuationCheckbox = document.getElementById('punctuationCheckbox');
91
+ const transcriptCard = document.getElementById('transcriptCard');
92
+ const outputText = document.getElementById('outputText'); // container that receives the transcript rows
93
+ const copyBtn = document.getElementById('copyBtn');
94
+ const downloadBtn = document.getElementById('downloadBtn');
95
+ const clearBtn = document.getElementById('clearBtn');
96
+ const progressSection = document.getElementById('progressSection');
97
+ const progressFill = document.getElementById('progressFill');
98
+ const progressText = document.getElementById('progressText');
99
+ const gpuInfo = document.getElementById('gpuInfo'); // shows WebGPU availability problems
100
+
101
+ // Recording state
102
+ let mediaRecorder = null; // MediaRecorder created by startRecording
103
+ let audioChunks = []; // data chunks collected while recording
104
+ let transcriptionAborted = false; // checked between segments in transcribe(); NOTE(review): set to true outside the visible code - confirm
105
+
106
+ // Utility functions
107
/**
 * Show the application state in the status indicator.
 *
 * @param {string} status - State name appended to the `status-dot` class.
 * @param {string} message - Text displayed next to the indicator.
 */
function setStatus(status, message) {
  Object.assign(statusDot, { className: `status-dot ${status}` });
  Object.assign(statusText, { textContent: message });
}
111
+
112
+ // Punctuation is handled by punctuator.js (applyPunctuation function)
113
+
114
/**
 * Show or hide the progress section.
 *
 * @param {boolean} show - Truthy to display the section, falsy to hide it.
 */
function showProgress(show) {
  let displayMode = 'none';
  if (show) {
    displayMode = 'block';
  }
  progressSection.style.display = displayMode;
}
117
+
118
/**
 * Update the progress bar width and its caption.
 *
 * @param {number} progress - Percentage used as the bar width.
 * @param {string} text - Caption shown with the bar.
 */
function updateProgress(progress, text) {
  const { style: barStyle } = progressFill;
  barStyle.width = `${progress}%`;
  progressText.textContent = text;
}
122
+
123
+ // Check WebGPU support
124
/**
 * Probe for a usable WebGPU adapter.
 *
 * On failure the reason is written to the `gpuInfo` element with a warning
 * or error color.
 *
 * @returns {Promise<boolean>} true when an adapter was obtained, else false.
 */
async function checkWebGPU() {
  // Write the failure reason to the page and yield the `false` result.
  const reportFailure = (text, color) => {
    gpuInfo.textContent = text;
    gpuInfo.style.color = color;
    return false;
  };

  if (!navigator.gpu) {
    return reportFailure('WebGPU not supported. Use Chrome 113+ or Edge 113+', '#e74c3c');
  }

  try {
    const adapter = await navigator.gpu.requestAdapter();
    return adapter ? true : reportFailure('No WebGPU adapter available', '#f39c12');
  } catch (e) {
    console.error('WebGPU error:', e);
    return reportFailure(`WebGPU error: ${e.message || e}`, '#e74c3c');
  }
}
147
+
148
+ // Load tokenizer using transformers.js
149
/**
 * Download the tokenizer definition and configuration and build a
 * transformers.js tokenizer from them.
 *
 * @returns {Promise<PreTrainedTokenizer>} The ready-to-use tokenizer.
 * @throws {Error} When either file cannot be downloaded (non-2xx response).
 */
async function loadTokenizer() {
  const repoBase = 'https://huggingface.co/ibm-granite/granite-4.0-1b-speech/resolve/main';

  // Fail with the HTTP status instead of a JSON parse error on an error page.
  const fetchJson = async (url) => {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to download ${url}: HTTP ${response.status}`);
    }
    return response.json();
  };

  const [tokenizerJson, tokenizerConfig] = await Promise.all([
    fetchJson(`${repoBase}/tokenizer.json`),
    fetchJson(`${repoBase}/tokenizer_config.json`),
  ]);
  return new PreTrainedTokenizer(tokenizerJson, tokenizerConfig);
}
156
+
157
+ // Get embeddings for token IDs (returns Float32Array)
158
/**
 * Run the embedding session on a sequence of token IDs.
 *
 * @param {number[]} tokenIds - Token IDs to embed.
 * @returns {Promise<{data: Float32Array, seqLen: number}>} A Float32Array copy
 *   of the `inputs_embeds` output and its second dimension.
 */
async function getEmbeddings(tokenIds) {
  const ids = BigInt64Array.from(tokenIds.map(BigInt));
  const feeds = {
    input_ids: new ort.Tensor('int64', ids, [1, tokenIds.length]),
  };
  const { inputs_embeds: embeds } = await embedSession.run(feeds);
  return {
    data: new Float32Array(embeds.data),
    seqLen: embeds.dims[1],
  };
}
166
+
167
+ // Pre-compute embeddings for all prompts
168
/**
 * Embed the shared prompt prefix and every prompt template once, storing the
 * results in `promptEmbeddings` (`prefix` plus one entry per PROMPTS key).
 *
 * @returns {Promise<void>}
 */
async function precomputePromptEmbeddings() {
  // Tokenize without special tokens, then embed.
  const embedText = (text) => getEmbeddings(tokenizer.encode(text, { add_special_tokens: false }));

  promptEmbeddings.prefix = await embedText(PROMPT_PREFIX);

  // One at a time: every call runs the same embedding session.
  for (const [key, text] of Object.entries(PROMPTS)) {
    promptEmbeddings[key] = await embedText(text);
  }

  // Count the prompt templates only; the `prefix` slot is not a prompt.
  console.log('Pre-computed embeddings for', Object.keys(PROMPTS).length, 'prompts');
}
181
+
182
+ // Session options - WebGPU only (no WASM fallback); shared by all three model sessions created in initModels
183
+ const sessionOptions = {
184
+ executionProviders: ['webgpu'], // single entry, so there is no other provider to fall back to
185
+ enableMemPattern: false, // NOTE(review): presumably disabled to limit memory use - confirm
186
+ enableCpuMemArena: false, // NOTE(review): presumably disabled to limit memory use - confirm
187
+ graphOptimizationLevel: 'basic',
188
+ };
189
+
190
/**
 * Yield to the browser for 100 ms between heavy steps.
 *
 * This cannot force a garbage collection; it only gives the engine an idle
 * window in which it may collect.
 *
 * @returns {Promise<void>} Resolves after the pause.
 */
async function gcPause() {
  const pauseMs = 100;
  await new Promise((done) => setTimeout(done, pauseMs));
}
195
+
196
+ // Load ONNX model with external data support
197
/**
 * Create an ONNX Runtime session for a model that may ship its weights in a
 * sidecar `<name>.onnx_data` file.
 *
 * The model and the optional sidecar are downloaded in parallel. A sidecar
 * that answers with a non-2xx status is treated as "no external data".
 *
 * @param {string} modelPath - URL of the `.onnx` file.
 * @param {object} options - Session options passed to ONNX Runtime.
 * @returns {Promise<ort.InferenceSession>} The created session.
 * @throws {Error} When the model file itself cannot be downloaded.
 */
async function loadModelWithExternalData(modelPath, options) {
  // Rewrite only the file extension (optionally followed by a query/fragment),
  // not the first ".onnx" that happens to appear anywhere in the URL.
  const dataPath = modelPath.replace(/\.onnx(?=$|[?#])/, '.onnx_data');
  const hasSidecarPath = dataPath !== modelPath;

  const fetchModel = async () => {
    const response = await fetch(modelPath);
    if (!response.ok) {
      throw new Error(`Failed to download model ${modelPath}: HTTP ${response.status}`);
    }
    return response.arrayBuffer();
  };

  // Resolves to null when there is no sidecar to load.
  const fetchSidecar = async () => {
    if (!hasSidecarPath) return null;
    const response = await fetch(dataPath);
    return response.ok ? response.arrayBuffer() : null;
  };

  const [modelBuffer, dataBuffer] = await Promise.all([fetchModel(), fetchSidecar()]);

  if (dataBuffer === null) {
    // No external data, load model directly
    return ort.InferenceSession.create(modelBuffer, options);
  }

  // The model refers to its external data by bare file name.
  const dataFileName = dataPath.split('/').pop();
  const sessionOptionsWithData = {
    ...options,
    externalData: [{ path: dataFileName, data: dataBuffer }],
  };
  return ort.InferenceSession.create(modelBuffer, sessionOptionsWithData);
}
229
+
230
+ // Initialize ONNX Runtime and load models
231
/**
 * Load the tokenizer and the three ONNX sessions (encoder, embedding,
 * decoder), reporting progress in the UI.
 *
 * Re-entrant calls are ignored while a load is running. After a successful
 * load `isModelLoading` stays true, so later calls are no-ops; it is reset
 * only when loading fails, which allows a retry.
 *
 * @returns {Promise<void>} Never rejects; failures are shown via setStatus.
 */
async function initModels() {
  if (isModelLoading) return;
  isModelLoading = true;

  setStatus('loading', 'Loading models...');
  showProgress(true);

  try {
    // The sessions are WebGPU-only, so stop before downloading any model
    // when WebGPU is unusable (checkWebGPU has already written the reason).
    const hasWebGPU = await checkWebGPU();
    if (!hasWebGPU) {
      throw new Error('WebGPU is not available, so the models cannot be loaded');
    }

    updateProgress(10, 'Initializing ONNX Runtime...');

    updateProgress(15, 'Loading tokenizer...');
    tokenizer = await loadTokenizer();

    // Models are loaded one at a time, with pauses in between.
    updateProgress(20, 'Loading encoder model...');
    encoderSession = await loadModelWithExternalData(ENCODER_PATH, sessionOptions);

    await gcPause();

    updateProgress(40, 'Loading embed tokens model...');
    embedSession = await loadModelWithExternalData(EMBED_PATH, sessionOptions);

    // Needs both the tokenizer and the embedding session.
    updateProgress(50, 'Pre-computing prompt embeddings...');
    await precomputePromptEmbeddings();

    await gcPause();

    updateProgress(60, 'Loading decoder model...');
    decoderSession = await loadModelWithExternalData(DECODER_PATH, sessionOptions);

    updateProgress(100, 'Models loaded!');
    showProgress(false);
    setStatus('ready', 'Ready - Record or upload audio');
    enableControls(true);
  } catch (error) {
    console.error('Model loading failed:', error);
    console.error('Error stack:', error?.stack);
    const errorMsg = error?.message || error?.toString() || 'Unknown error';
    setStatus('error', `Error: ${errorMsg}`);
    showProgress(false);
    isModelLoading = false;
  }
}
284
+
285
/**
 * Enable or disable the audio input controls (record button and file input).
 *
 * @param {boolean} enabled - Truthy to enable the controls.
 */
function enableControls(enabled) {
  const isDisabled = !enabled;
  recordBtn.disabled = isDisabled;
  audioFile.disabled = isDisabled;
}
289
+
290
+ // Mel spectrogram computation
291
+ // Uses custom implementation matching torchaudio
292
/**
 * Compute a log10 mel spectrogram of a mono signal.
 *
 * The signal is reflect-padded by N_FFT / 2 samples on both sides, framed
 * every HOP_LENGTH samples, multiplied by a periodic Hann window of
 * WIN_LENGTH samples centered inside each N_FFT-sample frame, turned into a
 * power spectrum and projected onto N_MELS mel filters.
 *
 * @param {Float32Array} audioData - Mono samples.
 * @returns {{data: Float32Array, numFrames: number, numMels: number}}
 *   Frame-major log-mel values (`data[frame * numMels + mel]`).
 */
function computeMelSpectrogram(audioData) {
  const sampleCount = audioData.length;
  const edgePad = Math.floor(N_FFT / 2);
  const paddedLength = sampleCount + 2 * edgePad;
  const padded = new Float32Array(paddedLength);

  // Original samples go in the middle.
  for (let i = 0; i < sampleCount; i++) {
    padded[edgePad + i] = audioData[i];
  }
  // Reflect padding without repeating the edge sample: position -(i+1)
  // mirrors sample i+1, and the tail mirrors from the second-to-last sample.
  // Indices are clamped for signals shorter than the padding.
  for (let i = 0; i < edgePad; i++) {
    const headSrc = Math.min(i + 1, sampleCount - 1);
    padded[edgePad - 1 - i] = audioData[headSrc];
    const tailSrc = Math.max(0, sampleCount - 2 - i);
    padded[edgePad + sampleCount + i] = audioData[tailSrc];
  }

  const numFrames = Math.floor((paddedLength - N_FFT) / HOP_LENGTH) + 1;
  const numBins = N_FFT / 2 + 1;
  const melFilterbank = createMelFilterbank(N_FFT, N_MELS, SAMPLE_RATE);

  // Periodic Hann window.
  const hann = new Float32Array(WIN_LENGTH);
  for (let i = 0; i < WIN_LENGTH; i++) {
    hann[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / WIN_LENGTH));
  }

  // The window is shorter than the FFT, so it sits centered in the frame
  // with zeros on both sides.
  const windowOffset = Math.floor((N_FFT - WIN_LENGTH) / 2);
  const melSpec = new Float32Array(numFrames * N_MELS);

  for (let frame = 0; frame < numFrames; frame++) {
    const windowStart = frame * HOP_LENGTH + windowOffset;

    const frameSamples = new Float32Array(N_FFT); // zeros outside the window
    for (let i = 0; i < WIN_LENGTH; i++) {
      frameSamples[windowOffset + i] = padded[windowStart + i] * hann[i];
    }

    const powerSpec = computePowerSpectrum(frameSamples);

    // Mel projection followed by a floored log10.
    for (let m = 0; m < N_MELS; m++) {
      const filterStart = m * numBins;
      let energy = 0;
      for (let k = 0; k < numBins; k++) {
        energy += powerSpec[k] * melFilterbank[filterStart + k];
      }
      melSpec[frame * N_MELS + m] = Math.log10(Math.max(energy, 1e-10));
    }
  }

  return { data: melSpec, numFrames, numMels: N_MELS };
}
358
+
359
+ // Create mel filterbank (torchaudio HTK style)
360
/**
 * Build a bank of triangular mel filters on the HTK mel scale, spanning
 * 0 Hz to sampleRate / 2, without area normalization.
 *
 * @param {number} nfft - FFT size; the bank covers nfft / 2 + 1 bins.
 * @param {number} nMels - Number of mel filters.
 * @param {number} sampleRate - Sample rate in Hz.
 * @returns {Float32Array} Filter weights, filter-major
 *   (`bank[mel * (nfft / 2 + 1) + bin]`).
 */
function createMelFilterbank(nfft, nMels, sampleRate) {
  const numBins = nfft / 2 + 1;

  // HTK mel scale conversions.
  const toMel = (hz) => 2595 * Math.log10(1 + hz / 700);
  const toHz = (mel) => 700 * (Math.pow(10, mel / 2595) - 1);

  const melLow = toMel(0);
  const melHigh = toMel(sampleRate / 2);

  // nMels + 2 band edges, evenly spaced in mel, stored in Hz.
  const edges = new Float32Array(nMels + 2);
  edges.forEach((_, i) => {
    edges[i] = toHz(melLow + (melHigh - melLow) * i / (nMels + 1));
  });

  // Center frequency of every FFT bin.
  const binHz = new Float32Array(numBins);
  binHz.forEach((_, i) => {
    binHz[i] = i * sampleRate / nfft;
  });

  // Width of each band between neighbouring edges.
  const widths = new Float32Array(nMels + 1);
  widths.forEach((_, i) => {
    widths[i] = edges[i + 1] - edges[i];
  });

  // Each filter is the positive part of min(rising slope, falling slope).
  const bank = new Float32Array(nMels * numBins);
  for (let m = 0; m < nMels; m++) {
    for (let k = 0; k < numBins; k++) {
      const hz = binHz[k];
      const rising = (hz - edges[m]) / widths[m];
      const falling = (edges[m + 2] - hz) / widths[m + 1];
      bank[m * numBins + k] = Math.max(0, Math.min(rising, falling));
    }
  }

  return bank;
}
403
+
404
+ // Compute power spectrum using radix-2 FFT
405
/**
 * Compute the one-sided power spectrum |X[k]|^2 for k = 0 .. n/2.
 *
 * Power-of-two lengths are delegated to the radix-2 FFT; every other length
 * uses a direct DFT.
 *
 * @param {Float32Array|number[]} signal - Real-valued input samples.
 * @returns {Float32Array} Power per frequency bin.
 */
function computePowerSpectrum(signal) {
  const n = signal.length;

  const isPowerOfTwo = (n & (n - 1)) === 0;
  if (isPowerOfTwo) {
    return computePowerSpectrumFFT(signal);
  }

  // Direct DFT, O(n^2), for all other lengths.
  const lastBin = n / 2;
  const power = new Float32Array(lastBin + 1);
  let k = 0;
  while (k <= lastBin) {
    let re = 0;
    let im = 0;
    for (let t = 0; t < n; t++) {
      const angle = -2 * Math.PI * k * t / n;
      re += signal[t] * Math.cos(angle);
      im += signal[t] * Math.sin(angle);
    }
    power[k] = re * re + im * im;
    k += 1;
  }
  return power;
}
426
+
427
+ // Radix-2 FFT for power spectrum
428
+ function computePowerSpectrumFFT(signal) {
429
+ const n = signal.length;
430
+
431
+ // Bit-reversal permutation
432
+ const real = new Float32Array(n);
433
+ const imag = new Float32Array(n);
434
+
435
+ for (let i = 0; i < n; i++) {
436
+ let j = 0;
437
+ let x = i;
438
+ for (let k = 0; k < Math.log2(n); k++) {
439
+ j = (j << 1) | (x & 1);
440
+ x >>= 1;
441
+ }
442
+ real[j] = signal[i];
443
+ }
444
+
445
+ // Cooley-Tukey FFT
446
+ for (let size = 2; size <= n; size *= 2) {
447
+ const halfSize = size / 2;
448
+ const step = Math.PI / halfSize;
449
+
450
+ for (let i = 0; i < n; i += size) {
451
+ for (let j = 0; j < halfSize; j++) {
452
+ const angle = -j * step;
453
+ const cos = Math.cos(angle);
454
+ const sin = Math.sin(angle);
455
+
456
+ const idx1 = i + j;
457
+ const idx2 = i + j + halfSize;
458
+
459
+ const tReal = cos * real[idx2] - sin * imag[idx2];
460
+ const tImag = sin * real[idx2] + cos * imag[idx2];
461
+
462
+ real[idx2] = real[idx1] - tReal;
463
+ imag[idx2] = imag[idx1] - tImag;
464
+ real[idx1] = real[idx1] + tReal;
465
+ imag[idx1] = imag[idx1] + tImag;
466
+ }
467
+ }
468
+ }
469
+
470
+ // Compute power spectrum (first half + DC and Nyquist)
471
+ const spectrum = new Float32Array(n / 2 + 1);
472
+ for (let k = 0; k <= n / 2; k++) {
473
+ spectrum[k] = real[k] * real[k] + imag[k] * imag[k];
474
+ }
475
+
476
+ return spectrum;
477
+ }
478
+
479
+ // Prepare audio features for encoder
480
/**
 * Turn raw audio into the encoder input features.
 *
 * Steps: log10 mel spectrogram, dynamic-range normalization
 * `max(logmel, globalMax - 8) / 4 + 1`, drop the last frame when the frame
 * count is odd, then stack each pair of consecutive frames into one feature
 * vector of 2 * N_MELS values.
 *
 * @param {Float32Array} audioData - Mono samples.
 * @returns {{data: Float32Array, shape: number[]}} Features and their shape
 *   `[1, stackedFrames, 2 * N_MELS]`.
 */
function prepareAudioFeatures(audioData) {
  const { data: logmel, numFrames: totalFrames } = computeMelSpectrogram(audioData);

  // The clamp floor uses the maximum over ALL frames, including a trailing
  // frame that is dropped below.
  let maxVal = -Infinity;
  for (let i = 0; i < logmel.length; i++) {
    if (logmel[i] > maxVal) maxVal = logmel[i];
  }
  const floorVal = maxVal - 8;

  // Keep an even number of frames so they pair up.
  const usedFrames = totalFrames - (totalFrames % 2);
  const stackedFrames = usedFrames / 2;
  const featureDim = 2 * N_MELS;

  // The spectrogram is frame-major, so frames 2t and 2t + 1 already lie
  // side by side: stacking is a straight copy of the first
  // usedFrames * N_MELS normalized values.
  const features = new Float32Array(stackedFrames * featureDim);
  for (let i = 0; i < features.length; i++) {
    features[i] = Math.max(logmel[i], floorVal) / 4 + 1;
  }

  return { data: features, shape: [1, stackedFrames, featureDim] };
}
520
+
521
+ // Transcribe a single audio segment and return the text
522
/**
 * Transcribe one audio segment with greedy autoregressive decoding.
 *
 * The decoder input is the concatenation of the pre-computed prefix
 * embeddings, the encoder output for the segment, and the embeddings of the
 * prompt selected in `promptSelect` (falling back to 'transcribe'). After the
 * first step only the embedding of the newest token is fed, together with the
 * `present.*` tensors returned by the previous step.
 *
 * @param {Float32Array} audioSegment - Mono samples of the segment.
 * @param {(partial: string) => void} [onPartialResult] - Called after every
 *   generated token with the text decoded so far.
 * @returns {Promise<string>} The decoded text of the segment.
 */
async function transcribeSegment(audioSegment, onPartialResult) {
  // Decoder cache layout. NOTE(review): layer count and KV dims are
  // hard-coded to match the exported decoder - confirm against its config.
  const numLayers = 40;
  const emptyKvDims = [1, 4, 0, 128];

  // Encode the audio.
  const audioFeatures = prepareAudioFeatures(audioSegment);
  const encoderInput = new ort.Tensor('float32', audioFeatures.data, audioFeatures.shape);
  const encoderOutput = await encoderSession.run({ input_features: encoderInput });
  const audioEmbeddings = encoderOutput.audio_features;

  // Prompt pieces computed at init.
  const prefixEmbed = promptEmbeddings.prefix;
  const suffixEmbed = promptEmbeddings[promptSelect.value] || promptEmbeddings['transcribe'];

  // [prefix | audio | suffix] in one contiguous buffer.
  const prefixSeqLen = prefixEmbed.seqLen;
  const audioSeqLen = audioEmbeddings.dims[1];
  const suffixSeqLen = suffixEmbed.seqLen;
  const totalSeqLen = prefixSeqLen + audioSeqLen + suffixSeqLen;

  const combinedEmbeds = new Float32Array(totalSeqLen * HIDDEN_SIZE);
  combinedEmbeds.set(prefixEmbed.data, 0);
  combinedEmbeds.set(new Float32Array(audioEmbeddings.data), prefixSeqLen * HIDDEN_SIZE);
  combinedEmbeds.set(suffixEmbed.data, (prefixSeqLen + audioSeqLen) * HIDDEN_SIZE);

  const generatedTokens = [];
  let currentEmbeds = combinedEmbeds;
  let currentSeqLen = totalSeqLen; // positions fed in this step
  let totalSeqLenSoFar = totalSeqLen; // positions covered by the attention mask
  let pastKeyValues = null;

  for (let step = 0; step < MAX_NEW_TOKENS; step++) {
    const attentionMask = new BigInt64Array(totalSeqLenSoFar).fill(1n);
    const decoderInputs = {
      inputs_embeds: new ort.Tensor('float32', currentEmbeds, [1, currentSeqLen, HIDDEN_SIZE]),
      attention_mask: new ort.Tensor('int64', attentionMask, [1, totalSeqLenSoFar]),
    };

    if (pastKeyValues) {
      // Feed the previous step's cache back in.
      for (let i = 0; i < numLayers; i++) {
        decoderInputs[`past_key_values.${i}.key`] = pastKeyValues[`present.${i}.key`];
        decoderInputs[`past_key_values.${i}.value`] = pastKeyValues[`present.${i}.value`];
      }
    } else {
      // First step: zero-length float16 cache tensors.
      const emptyPast = new Uint16Array(0);
      for (let i = 0; i < numLayers; i++) {
        decoderInputs[`past_key_values.${i}.key`] = new ort.Tensor('float16', emptyPast, emptyKvDims);
        decoderInputs[`past_key_values.${i}.value`] = new ort.Tensor('float16', emptyPast, emptyKvDims);
      }
    }

    const decoderOutput = await decoderSession.run(decoderInputs);
    pastKeyValues = decoderOutput;

    // Only the last position's logits are needed. Take a view of that row
    // rather than copying the whole tensor, which on the first step holds
    // one row per prompt position.
    const logitOffset = (currentSeqLen - 1) * VOCAB_SIZE;
    const lastLogits = decoderOutput.logits.data.subarray(logitOffset, logitOffset + VOCAB_SIZE);

    // Greedy pick.
    let nextToken = 0;
    let maxVal = lastLogits[0];
    for (let i = 1; i < VOCAB_SIZE; i++) {
      if (lastLogits[i] > maxVal) {
        maxVal = lastLogits[i];
        nextToken = i;
      }
    }

    // Never end on the very first token: take the best non-EOS token instead.
    if (step === 0 && nextToken === EOS_TOKEN) {
      nextToken = 0;
      maxVal = -Infinity;
      for (let i = 0; i < VOCAB_SIZE; i++) {
        if (i !== EOS_TOKEN && lastLogits[i] > maxVal) {
          maxVal = lastLogits[i];
          nextToken = i;
        }
      }
    }

    if (nextToken === EOS_TOKEN) {
      break;
    }

    generatedTokens.push(nextToken);

    // Stream the text decoded so far.
    if (onPartialResult) {
      onPartialResult(tokenizer.decode(generatedTokens));
    }

    // The next step is fed only the embedding of the token just produced.
    const nextTokenTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(nextToken)]), [1, 1]);
    const nextEmbedOutput = await embedSession.run({ input_ids: nextTokenTensor });
    currentEmbeds = new Float32Array(nextEmbedOutput.inputs_embeds.data);
    currentSeqLen = 1;
    totalSeqLenSoFar += 1;
  }

  return tokenizer.decode(generatedTokens);
}
620
+
621
+ // Wait until audio playback reaches a specific time
622
/**
 * Wait until the audio player has reached a playback position.
 *
 * The position is polled once per animation frame. The wait also ends as
 * soon as the player is paused.
 *
 * @param {number} targetTime - Playback position in seconds.
 * @returns {Promise<void>} Resolves when the position is reached or playback is paused.
 */
function waitForPlaybackTime(targetTime) {
  return new Promise((resolve) => {
    (function poll() {
      const stillPlayingBeforeTarget = !audioPlayer.paused && audioPlayer.currentTime < targetTime;
      if (stillPlayingBeforeTarget) {
        requestAnimationFrame(poll);
        return;
      }
      resolve();
    })();
  });
}
634
+
635
+ // Run inference with segmentation and audio sync
636
/**
 * Transcribe the current audio clip segment by segment while it plays.
 *
 * Speech segments come from the VAD (`getSpeechSegments`). Playback starts
 * from the beginning, and each segment is processed no earlier than its start
 * time measured on the wall clock from the moment playback was started.
 * Partial text is streamed into `outputText`; finished segments optionally go
 * through the punctuation model when the detected language is supported.
 *
 * @returns {Promise<void>} Never rejects; failures are shown via setStatus.
 */
async function transcribe() {
  if (!encoderSession || !embedSession || !decoderSession || !currentAudioData) {
    setStatus('error', 'Model or audio not ready');
    return;
  }

  // Model output is inserted as HTML, so escape it. `&` must be replaced
  // first, otherwise the entities produced for `<` and `>` get re-escaped.
  const escapeHtml = (text) => text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const makeRow = (ts, text) => `<div class="transcript-row"><span class="timestamp">${ts}</span><span class="transcript-text">${text}</span></div>`;

  setStatus('processing', 'Processing audio...');
  transcribeBtn.disabled = true;
  transcriptionAborted = false;
  outputText.textContent = '';
  transcriptCard.style.display = 'block';
  showProgress(true);

  try {
    // Get speech segments using VAD
    updateProgress(5, 'Detecting speech segments...');
    const segments = await getSpeechSegments(currentAudioData, SAMPLE_RATE);
    console.log(`VAD found ${segments.length} segment(s)`);

    // Start audio playback immediately. play() returns a promise that can
    // reject; a playback failure must not abort the transcription or surface
    // as an unhandled rejection.
    audioPlayer.currentTime = 0;
    audioPlayer.play()?.catch((playError) => {
      console.warn('Audio playback could not start:', playError);
    });
    playBtn.querySelector('.play-icon').style.display = 'none';
    playBtn.querySelector('.pause-icon').style.display = 'block';
    const playbackStartTime = performance.now() / 1000;

    // Rows of finished segments, already escaped.
    const displayedResults = [];
    const totalSegments = segments.length;

    for (let segIdx = 0; segIdx < totalSegments; segIdx++) {
      if (transcriptionAborted) break;

      const seg = segments[segIdx];

      const segProgress = ((segIdx + 1) / totalSegments) * 100;
      updateProgress(segProgress, '');

      // Do not run ahead of the audio: wait until the segment's start time.
      const elapsed = (performance.now() / 1000) - playbackStartTime;
      const waitTime = seg.start - elapsed;
      if (waitTime > 0) {
        await new Promise((resolve) => setTimeout(resolve, waitTime * 1000));
      }

      setStatus('processing', `Segment ${segIdx + 1}/${totalSegments}`);

      // Extract this segment's samples.
      const startSample = Math.floor(seg.start * SAMPLE_RATE);
      const endSample = Math.floor(seg.end * SAMPLE_RATE);
      const audioSegment = currentAudioData.slice(startSample, endSample);

      const timestamp = formatTimestamp(seg.start);

      // Transcribe with streaming display of the partial text.
      const segmentText = await transcribeSegment(audioSegment, (partial) => {
        const rows = [...displayedResults, makeRow(timestamp, escapeHtml(partial))];
        outputText.innerHTML = rows.join('');
        outputText.scrollTop = outputText.scrollHeight;
      });

      if (segmentText.trim()) {
        let finalSegmentText = segmentText.trim();
        // Auto-detect language and apply punctuation if supported
        if (punctuationCheckbox.checked) {
          const detectedLang = detect(finalSegmentText);
          // The multilingual list is provided on `window` by another script;
          // fall back to English only when it is absent.
          const supportedLangs = ['en', ...(window.MULTILINGUAL_PUNCT_LANGS ?? [])];
          if (supportedLangs.includes(detectedLang)) {
            // Strip existing punctuation before applying punctuation model
            const stripped = finalSegmentText.replace(/[.,!?]/g, ' ').replace(/\s+/g, ' ').trim();
            finalSegmentText = await applyPunctuation(stripped, detectedLang);
            // Replace unknown tokens from punctuator with spaces
            finalSegmentText = finalSegmentText.replace(/<unk>/gi, ' ').replace(/\s+/g, ' ').trim();
          }
        }
        displayedResults.push(makeRow(timestamp, escapeHtml(finalSegmentText)));
        outputText.innerHTML = displayedResults.join('');
        outputText.scrollTop = outputText.scrollHeight;
      }
    }

    // Final output
    if (displayedResults.length === 0) {
      outputText.innerHTML = '<span style="color: #94a3b8;">(No speech detected)</span>';
    }
    copyBtn.disabled = false;

    showProgress(false);
    setStatus('ready', 'Transcription complete');
  } catch (error) {
    console.error('Transcription failed:', error);
    setStatus('error', `Error: ${error.message}`);
    showProgress(false);
  } finally {
    transcribeBtn.disabled = false;
  }
}
738
+
739
+ // Audio recording
740
+ let isRecording = false; // true while a recording is in progress: set by startRecording, cleared by stopRecording
741
+
742
/**
 * Start a recording when idle, stop it when one is in progress.
 */
function toggleRecording() {
  if (!isRecording) {
    startRecording();
    return;
  }
  stopRecording();
}
749
+
750
/**
 * Ask for microphone access and start recording.
 *
 * When the recorder stops, the microphone is released, the recording is
 * attached to the audio player, decoded via processAudioBlob, and the
 * waveform and time display are refreshed.
 *
 * @returns {Promise<void>} Never rejects; failures are shown via setStatus.
 */
async function startRecording() {
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    const recorder = new MediaRecorder(stream);
    mediaRecorder = recorder;
    audioChunks = [];

    recorder.ondataavailable = (event) => {
      audioChunks.push(event.data);
    };

    recorder.onstop = async () => {
      // Release the microphone first so it is freed even if a later step fails.
      stream.getTracks().forEach((track) => track.stop());

      // Label the blob with the container the recorder actually produced;
      // MediaRecorder does not emit WAV.
      const audioBlob = new Blob(audioChunks, { type: recorder.mimeType || '' });
      const audioUrl = URL.createObjectURL(audioBlob);
      audioPlayer.src = audioUrl;
      audioPreview.style.display = 'flex';
      transcribeSection.style.display = 'flex';

      await processAudioBlob(audioBlob);
      drawWaveform();
      updateAudioTime();
    };

    recorder.start();
    isRecording = true;
    setStatus('recording', 'Recording...');

    // Update button UI
    recordBtn.querySelector('.mic-icon').style.display = 'none';
    recordBtn.querySelector('.stop-icon').style.display = 'block';
    recordBtn.querySelector('span').textContent = 'Stop';
    recordBtn.classList.add('recording');
  } catch (error) {
    console.error('Recording failed:', error);
    // Only permission problems are reported as "access denied".
    const denied = error?.name === 'NotAllowedError' || error?.name === 'SecurityError';
    setStatus('error', denied
      ? 'Microphone access denied'
      : `Recording failed: ${error?.message || error}`);
  }
}
789
+
790
/**
 * Stop the active MediaRecorder (if there is one that is not already
 * inactive) and switch the record button back to its idle appearance.
 * Does nothing when no capture is running.
 */
function stopRecording() {
    const recorderIsLive = Boolean(mediaRecorder) && mediaRecorder.state !== 'inactive';
    if (!recorderIsLive) {
        return;
    }

    mediaRecorder.stop();
    isRecording = false;
    setStatus('ready', 'Recording stopped - Click Transcribe');

    // Restore the idle button: microphone icon visible, stop icon hidden.
    const iconDisplay = [
        ['.mic-icon', 'block'],
        ['.stop-icon', 'none'],
    ];
    for (const [selector, display] of iconDisplay) {
        recordBtn.querySelector(selector).style.display = display;
    }
    recordBtn.querySelector('span').textContent = 'Record';
    recordBtn.classList.remove('recording');
}
803
+
804
+ // Process audio file/blob
805
/**
 * Decode an audio Blob/File into mono Float32 samples and store them in
 * `currentAudioData`, enabling the Transcribe button on success.
 *
 * Decoding uses a temporary AudioContext created at SAMPLE_RATE; the
 * context is always closed afterwards because browsers only allow a small
 * number of live contexts. Errors are logged and reported via setStatus();
 * the function itself never rejects.
 *
 * @param {Blob} blob - Audio container readable by decodeAudioData().
 * @returns {Promise<void>}
 */
async function processAudioBlob(blob) {
    let audioCtx = null;
    try {
        const arrayBuffer = await blob.arrayBuffer();
        audioCtx = new AudioContext({ sampleRate: SAMPLE_RATE });
        const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

        // Convert to mono Float32Array by averaging every channel (for
        // stereo this is the usual (left + right) / 2).
        const channelCount = audioBuffer.numberOfChannels;
        let audioData;
        if (channelCount > 1) {
            const channels = [];
            for (let c = 0; c < channelCount; c++) {
                channels.push(audioBuffer.getChannelData(c));
            }
            audioData = new Float32Array(channels[0].length);
            for (let i = 0; i < audioData.length; i++) {
                let sum = 0;
                for (const channel of channels) {
                    sum += channel[i];
                }
                audioData[i] = sum / channelCount;
            }
        } else {
            audioData = audioBuffer.getChannelData(0);
        }

        // Resample if the decoder did not already deliver SAMPLE_RATE.
        if (audioBuffer.sampleRate !== SAMPLE_RATE) {
            audioData = resample(audioData, audioBuffer.sampleRate, SAMPLE_RATE);
        }

        currentAudioData = audioData;
        transcribeBtn.disabled = false;
    } catch (error) {
        console.error('Audio processing failed:', error);
        setStatus('error', 'Failed to process audio');
    } finally {
        // The context was only needed for decoding; release it.
        if (audioCtx && audioCtx.state !== 'closed') {
            try {
                await audioCtx.close();
            } catch (closeError) {
                console.warn('Failed to close AudioContext:', closeError);
            }
        }
    }
}
837
+
838
+ // Simple linear resampling
839
/**
 * Resample mono audio with linear interpolation.
 *
 * No low-pass filtering is applied, so downsampling can alias; this is a
 * deliberately simple resampler.
 *
 * @param {Float32Array} audioData - Input samples.
 * @param {number} fromRate - Sample rate of `audioData` in Hz.
 * @param {number} toRate - Desired sample rate in Hz.
 * @returns {Float32Array} New array of round(length * toRate / fromRate) samples.
 * @throws {RangeError} If either rate is not a positive finite number.
 */
function resample(audioData, fromRate, toRate) {
    const ratesAreValid =
        Number.isFinite(fromRate) && Number.isFinite(toRate) && fromRate > 0 && toRate > 0;
    if (!ratesAreValid) {
        // Without this check a zero/NaN rate silently yields an empty
        // result (or an opaque "invalid typed array length" error).
        throw new RangeError(
            `resample: sample rates must be positive finite numbers (got ${fromRate} -> ${toRate})`
        );
    }

    const lastIndex = audioData.length - 1;
    if (lastIndex < 0) {
        return new Float32Array(0);
    }

    const ratio = fromRate / toRate;
    const newLength = Math.round(audioData.length / ratio);
    const result = new Float32Array(newLength);

    for (let i = 0; i < newLength; i++) {
        const srcIndex = i * ratio;
        const srcIndexFloor = Math.floor(srcIndex);
        // Clamp the right-hand neighbour so the final output sample does
        // not read past the end of the input.
        const srcIndexCeil = Math.min(srcIndexFloor + 1, lastIndex);
        const t = srcIndex - srcIndexFloor;
        result[i] = audioData[srcIndexFloor] * (1 - t) + audioData[srcIndexCeil] * t;
    }

    return result;
}
854
+
855
+ // Handle file upload
856
/**
 * `change` handler for the file input: loads the first selected file, or
 * does nothing when the selection is empty.
 *
 * @param {Event} event - Change event whose target exposes `files`.
 */
async function handleFileUpload(event) {
    const { files } = event.target;
    const selectedFile = files[0];
    if (selectedFile) {
        await loadAudioFile(selectedFile);
    }
}
861
+
862
+ // Handle dropped files
863
/**
 * `drop` handler for the input card. Cancels the browser's default drop
 * action, removes the drag highlight, and loads the first dropped file
 * when its MIME type starts with `audio/`; otherwise reports an error.
 *
 * @param {DragEvent} event
 */
async function handleFileDrop(event) {
    event.preventDefault();
    inputCard.classList.remove('drag-over');

    const droppedFile = event.dataTransfer.files[0];
    const isAudio = Boolean(droppedFile) && droppedFile.type.startsWith('audio/');
    if (isAudio) {
        await loadAudioFile(droppedFile);
        return;
    }

    setStatus('error', 'Please drop an audio file');
}
874
+
875
+ // Common file loading logic
876
/**
 * Load an uploaded or dropped audio file: point the preview player at it,
 * decode it for transcription, then draw the waveform and time display.
 *
 * If decoding fails, the error status set by processAudioBlob() is kept
 * (it is not overwritten with "Audio loaded") and the Transcribe button
 * stays disabled so stale audio from a previous file cannot be used.
 *
 * @param {Blob} file - The audio file to load.
 * @returns {Promise<void>}
 */
async function loadAudioFile(file) {
    setStatus('processing', 'Processing audio file...');

    const previousUrl = audioPlayer.src;
    audioPlayer.src = URL.createObjectURL(file);
    if (typeof previousUrl === 'string' && previousUrl.startsWith('blob:')) {
        // The previous preview is no longer reachable; free its memory.
        URL.revokeObjectURL(previousUrl);
    }
    audioPreview.style.display = 'flex';
    transcribeSection.style.display = 'flex';

    // Drop the previous decode first: processAudioBlob() only assigns
    // currentAudioData on success, so a null afterwards means it failed.
    currentAudioData = null;
    transcribeBtn.disabled = true;

    await processAudioBlob(file);
    if (!currentAudioData) {
        return;
    }

    drawWaveform();
    updateAudioTime();
    setStatus('ready', 'Audio loaded - Click Transcribe');
}
889
+
890
+ // Draw waveform visualization
891
/**
 * Render `currentAudioData` on the waveform canvas as vertical bars
 * (2 CSS px wide on a 3 px pitch), normalised so the loudest bar fills
 * 90% of the canvas height. Does nothing when no audio is loaded.
 */
function drawWaveform() {
    if (!currentAudioData) return;

    const canvas = waveformCanvas;
    const ctx = canvas.getContext('2d');
    const dpr = window.devicePixelRatio || 1;

    // Size the backing store in device pixels (this also clears the canvas
    // and resets its transform), then draw in CSS pixels.
    const rect = canvas.getBoundingClientRect();
    canvas.width = rect.width * dpr;
    canvas.height = rect.height * dpr;
    ctx.scale(dpr, dpr);

    const width = rect.width;
    const height = rect.height;
    const centerY = height / 2;

    // Downsample audio data for visualization
    const samples = currentAudioData;
    const barCount = Math.floor(width / 3);
    if (barCount <= 0 || samples.length === 0) {
        // Canvas is hidden/zero-width or the clip is empty: nothing to draw.
        return;
    }
    // At least one sample per bar. Without the floor of 1, a clip with
    // fewer samples than bars divides by zero below and every bar height
    // becomes NaN, so nothing is drawn at all.
    const samplesPerBar = Math.max(1, Math.floor(samples.length / barCount));

    // Mean absolute amplitude per bar; positions past the end count as 0.
    const barAmplitudes = [];
    for (let i = 0; i < barCount; i++) {
        let sum = 0;
        const start = i * samplesPerBar;
        for (let j = 0; j < samplesPerBar; j++) {
            sum += Math.abs(samples[start + j] || 0);
        }
        barAmplitudes.push(sum / samplesPerBar);
    }

    // Floor of 0.01 keeps silence from being scaled up to full height.
    const maxAmp = Math.max(...barAmplitudes, 0.01);

    // Get color based on color scheme
    const isDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    ctx.fillStyle = isDark ? '#64748b' : '#cbd5e1';

    // Draw bars normalized to fill height, vertically centred, min 2 px.
    for (let i = 0; i < barCount; i++) {
        const normalized = barAmplitudes[i] / maxAmp;
        const barHeight = Math.max(2, normalized * height * 0.9);

        ctx.fillRect(i * 3, centerY - barHeight / 2, 2, barHeight);
    }
}
939
+
940
+ // Format time as M:SS
941
/**
 * Format a duration in seconds as `M:SS` (minutes are not capped at 59).
 *
 * Media elements can report NaN (metadata not loaded) or Infinity
 * (stream/recording without a known length); those, and negative values,
 * are shown as `0:00` instead of leaking "NaN"/"Infinity" into the UI.
 *
 * @param {number} seconds
 * @returns {string}
 */
function formatTime(seconds) {
    const wholeSeconds = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0;
    const mins = Math.floor(wholeSeconds / 60);
    const secs = wholeSeconds % 60;
    return `${mins}:${secs.toString().padStart(2, '0')}`;
}
946
+
947
+ // Update audio time display
948
/**
 * Refresh the "current / total" time label and the waveform progress
 * overlay from the preview player's state.
 *
 * - Player reports a finite duration: show `current / duration` and
 *   size the progress overlay accordingly.
 * - Player reports Infinity (a media element can do this when the
 *   container carries no length): fall back to the length of the decoded
 *   samples so the label never reads "Infinity:NaN".
 * - No usable duration yet: show only the decoded length.
 */
function updateAudioTime() {
    const current = audioPlayer.currentTime || 0;
    const decodedDuration = currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0;

    // NaN (metadata not loaded) becomes 0 here; Infinity is handled next.
    const reportedDuration = audioPlayer.duration || 0;
    const duration = Number.isFinite(reportedDuration) ? reportedDuration : decodedDuration;

    if (duration > 0) {
        audioTime.textContent = `${formatTime(current)} / ${formatTime(duration)}`;
        // Clamp: with the decoded-length fallback the playhead may end up
        // marginally past the computed duration.
        const percent = Math.min(100, (current / duration) * 100);
        waveformProgress.style.width = `${percent}%`;
    } else {
        audioTime.textContent = formatTime(decodedDuration);
    }
}
958
+
959
+ // Toggle play/pause
960
/**
 * Click handler for the play button. Starts playback when the preview
 * player is paused; otherwise pauses it. Pausing also aborts a running
 * transcription (detected by the Transcribe button being disabled while
 * no abort has been requested yet).
 */
function togglePlayback() {
    const playIcon = playBtn.querySelector('.play-icon');
    const pauseIcon = playBtn.querySelector('.pause-icon');

    if (audioPlayer.paused) {
        audioPlayer.play();
        playIcon.style.display = 'none';
        pauseIcon.style.display = 'block';
        return;
    }

    audioPlayer.pause();
    playIcon.style.display = 'block';
    pauseIcon.style.display = 'none';

    // Stop transcription if running
    const transcriptionRunning = transcribeBtn.disabled && !transcriptionAborted;
    if (transcriptionRunning) {
        transcriptionAborted = true;
        showProgress(false);
        setStatus('ready', 'Transcription stopped');
        transcribeBtn.disabled = false;
    }
}
978
+
979
+ // Seek in audio
980
/**
 * Click handler for the waveform: seek the preview player to the clicked
 * horizontal position.
 *
 * The click position is clamped to the canvas, and when the player has no
 * finite duration (NaN before metadata loads, Infinity for length-less
 * media) the length of the decoded samples is used instead. If no usable
 * duration exists the click is ignored, because assigning a non-finite
 * value to `currentTime` throws.
 *
 * @param {MouseEvent} event
 */
function seekAudio(event) {
    const rect = waveformCanvas.getBoundingClientRect();
    if (!(rect.width > 0)) return;

    const decodedDuration = currentAudioData ? currentAudioData.length / SAMPLE_RATE : 0;
    const duration = Number.isFinite(audioPlayer.duration) ? audioPlayer.duration : decodedDuration;
    if (!(duration > 0)) return;

    const x = event.clientX - rect.left;
    const percent = Math.min(1, Math.max(0, x / rect.width));
    audioPlayer.currentTime = percent * duration;
    updateAudioTime();
}
987
+
988
+ // Copy to clipboard
989
/**
 * Copy the transcript to the clipboard and flash "Copied!" in the copy
 * button's tooltip for two seconds.
 *
 * Uses `innerText` so that `<br>` line breaks in the rendered transcript
 * are copied as newlines (`textContent` would glue the lines together),
 * matching what downloadTranscript() writes.
 *
 * @returns {Promise<void>} Never rejects; failures are logged.
 */
async function copyToClipboard() {
    try {
        const rendered = outputText.innerText;
        const text = typeof rendered === 'string' ? rendered : outputText.textContent;
        await navigator.clipboard.writeText(text);

        // Brief visual feedback via title attribute. Remember the real
        // title only once: capturing it on every click would store
        // "Copied!" on a quick second click and never restore the tooltip.
        if (copyBtn.dataset.originalTitle === undefined) {
            copyBtn.dataset.originalTitle = copyBtn.title;
        }
        copyBtn.title = 'Copied!';
        setTimeout(() => {
            copyBtn.title = copyBtn.dataset.originalTitle;
        }, 2000);
    } catch (error) {
        console.error('Copy failed:', error);
    }
}
1002
+
1003
/**
 * Save the transcript as `transcript.txt` and flash "Downloaded!" in the
 * download button's tooltip for two seconds. Does nothing when the
 * transcript is empty.
 *
 * The text is taken from `innerText`, which yields newlines for `<br>` and
 * decoded characters. Stripping tags from `innerHTML` with a regex (the
 * previous approach) left HTML entities such as `&amp;` in the file.
 */
function downloadTranscript() {
    const rendered = outputText.innerText;
    const text = typeof rendered === 'string' ? rendered : outputText.textContent;
    if (!text) return;

    const blob = new Blob([text], { type: 'text/plain' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = 'transcript.txt';
    // The anchor is attached only for the duration of the synthetic click.
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);

    // Brief visual feedback. Remember the real title only once so a quick
    // second click cannot capture "Downloaded!" as the title to restore.
    if (downloadBtn.dataset.originalTitle === undefined) {
        downloadBtn.dataset.originalTitle = downloadBtn.title;
    }
    downloadBtn.title = 'Downloaded!';
    setTimeout(() => {
        downloadBtn.title = downloadBtn.dataset.originalTitle;
    }, 2000);
}
1027
+
1028
/**
 * Reset the page to its "no audio loaded" state: stop and detach the
 * preview player, forget the decoded samples, hide the player/transcribe/
 * transcript sections and reset waveform, time label, buttons, file input
 * and status line.
 */
function clearAudio() {
    // Stop any playback and detach the media. Removing the attribute and
    // calling load() resets the element; assigning an empty string instead
    // makes the element try to load an empty URL.
    audioPlayer.pause();
    const previousUrl = audioPlayer.src;
    audioPlayer.removeAttribute('src');
    audioPlayer.load();
    if (typeof previousUrl === 'string' && previousUrl.startsWith('blob:')) {
        // Free the memory held by the discarded preview.
        URL.revokeObjectURL(previousUrl);
    }

    // pause() above does not go through togglePlayback(), so put the play
    // button back into its "play" state explicitly.
    playBtn.querySelector('.play-icon').style.display = 'block';
    playBtn.querySelector('.pause-icon').style.display = 'none';

    // Reset audio state
    currentAudioData = null;

    // Hide audio player and transcribe section
    audioPreview.style.display = 'none';
    transcribeSection.style.display = 'none';

    // Clear transcript
    transcriptCard.style.display = 'none';
    outputText.textContent = '';

    // Reset waveform
    waveformProgress.style.width = '0%';
    const ctx = waveformCanvas.getContext('2d');
    ctx.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);

    // Reset time display
    audioTime.textContent = '0:00';

    // Reset buttons
    transcribeBtn.disabled = true;

    // Reset file input so selecting the same file again fires `change`
    audioFile.value = '';

    // Update status
    setStatus('ready', 'Ready');
}
1061
+
1062
+ // Event listeners
1063
// Wire the UI to its handlers. Kept inside a block so the helper
// constants below do not add names to the module scope.
{
    // When playback reaches the end: show the play icon again and reset
    // the waveform progress overlay.
    const resetPlaybackUi = () => {
        playBtn.querySelector('.play-icon').style.display = 'block';
        playBtn.querySelector('.pause-icon').style.display = 'none';
        waveformProgress.style.width = '0%';
    };

    // Drag-and-drop highlight on the input card. preventDefault() on
    // `dragover` is what allows the card to receive the `drop` event.
    const highlightDropTarget = (e) => {
        e.preventDefault();
        inputCard.classList.add('drag-over');
    };
    const unhighlightDropTarget = (e) => {
        e.preventDefault();
        inputCard.classList.remove('drag-over');
    };

    // [target, event name, handler] - registered in this order.
    const bindings = [
        // Audio input
        [recordBtn, 'click', toggleRecording],
        [audioFile, 'change', handleFileUpload],

        // Audio player controls
        [playBtn, 'click', togglePlayback],
        [waveformCanvas, 'click', seekAudio],
        [audioPlayer, 'timeupdate', updateAudioTime],
        [audioPlayer, 'ended', resetPlaybackUi],

        // Redraw waveform on resize
        [window, 'resize', drawWaveform],

        // Transcript actions
        [transcribeBtn, 'click', transcribe],
        [copyBtn, 'click', copyToClipboard],
        [downloadBtn, 'click', downloadTranscript],
        [clearBtn, 'click', clearAudio],

        // Drag and drop on input card
        [inputCard, 'dragover', highlightDropTarget],
        [inputCard, 'dragleave', unhighlightDropTarget],
        [inputCard, 'drop', handleFileDrop],

        // Initialize on load
        [window, 'load', initModels],
    ];

    for (const [target, eventName, handler] of bindings) {
        target.addEventListener(eventName, handler);
    }
}
index.html CHANGED
@@ -1,19 +1,135 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Granite Speech WebGPU</title>
7
+ <link rel="stylesheet" href="style.css?v=3">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <!-- Header -->
12
+ <h1>Granite Speech WebGPU</h1>
13
+ <h2>Speech recognition and translation directly in your browser</h2>
14
+
15
+ <!-- Audio Input Card -->
16
+ <div class="input-card">
17
+ <div class="input-options">
18
+ <button id="recordBtn" class="input-tile" disabled>
19
+ <svg class="mic-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
20
+ <path stroke-linecap="round" stroke-linejoin="round" d="M12 18.75a6 6 0 006-6v-1.5m-6 7.5a6 6 0 01-6-6v-1.5m6 7.5v3.75m-3.75 0h7.5M12 15.75a3 3 0 01-3-3V4.5a3 3 0 116 0v8.25a3 3 0 01-3 3z" />
21
+ </svg>
22
+ <svg class="stop-icon" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" style="display: none;">
23
+ <rect x="6" y="6" width="12" height="12" rx="1" />
24
+ </svg>
25
+ <span>Record</span>
26
+ </button>
27
+ <div class="divider"></div>
28
+ <label class="input-tile file-label">
29
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
30
+ <path stroke-linecap="round" stroke-linejoin="round" d="M3.75 9.776c.112-.017.227-.026.344-.026h15.812c.117 0 .232.009.344.026m-16.5 0a2.25 2.25 0 00-1.883 2.542l.857 6a2.25 2.25 0 002.227 1.932H19.05a2.25 2.25 0 002.227-1.932l.857-6a2.25 2.25 0 00-1.883-2.542m-16.5 0V6A2.25 2.25 0 016 3.75h3.879a1.5 1.5 0 011.06.44l2.122 2.12a1.5 1.5 0 001.06.44H18A2.25 2.25 0 0120.25 9v.776" />
31
+ </svg>
32
+ <span>Upload</span>
33
+ <input type="file" id="audioFile" accept="audio/*" hidden>
34
+ </label>
35
+ </div>
36
+ <div class="progress-bar">
37
+ <div class="progress-fill" id="progressFill"></div>
38
+ </div>
39
+ </div>
40
+
41
+ <!-- Status -->
42
+ <div class="status-section" id="statusSection">
43
+ <span class="status-dot" id="statusDot"></span>
44
+ <span id="statusText">Loading...</span>
45
+ </div>
46
+
47
+ <!-- Audio Player (hidden initially) -->
48
+ <div class="audio-player" id="audioPreview" style="display: none;">
49
+ <button class="play-btn" id="playBtn">
50
+ <svg class="play-icon" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor">
51
+ <path d="M8 5v14l11-7z"/>
52
+ </svg>
53
+ <svg class="pause-icon" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor" style="display: none;">
54
+ <path d="M6 19h4V5H6v14zm8-14v14h4V5h-4z"/>
55
+ </svg>
56
+ </button>
57
+ <div class="waveform-container">
58
+ <canvas id="waveformCanvas"></canvas>
59
+ <div class="waveform-progress" id="waveformProgress"></div>
60
+ </div>
61
+ <span class="audio-time" id="audioTime">0:00</span>
62
+ <button class="clear-btn" id="clearBtn" title="Clear">
63
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
64
+ <path stroke-linecap="round" stroke-linejoin="round" d="M6 18L18 6M6 6l12 12" />
65
+ </svg>
66
+ </button>
67
+ <audio id="audioPlayer" style="display: none;"></audio>
68
+ </div>
69
+
70
+ <!-- Transcribe Section -->
71
+ <div class="transcribe-section" id="transcribeSection" style="display: none;">
72
+ <div class="task-row">
73
+ <label for="promptSelect">Task:</label>
74
+ <select id="promptSelect">
75
+ <option value="transcribe">Transcribe</option>
76
+ <option value="translate_en">Translate to English</option>
77
+ <option value="translate_fr">Translate to French</option>
78
+ <option value="translate_de">Translate to German</option>
79
+ <option value="translate_es">Translate to Spanish</option>
80
+ <option value="translate_pt">Translate to Portuguese</option>
81
+ <option value="translate_ja">Translate to Japanese</option>
82
+ </select>
83
+ </div>
84
+ <label class="checkbox-row">
85
+ <input type="checkbox" id="punctuationCheckbox" checked>
86
+ <span>Add punctuation (English only)</span>
87
+ </label>
88
+ <button id="transcribeBtn" class="transcribe-btn" disabled>
89
+ Transcribe
90
+ </button>
91
+ </div>
92
+
93
+ <!-- Model Loading Progress -->
94
+ <div class="model-progress" id="progressSection" style="display: none;">
95
+ <span id="progressText">Loading model...</span>
96
+ </div>
97
+
98
+ <!-- Transcript Output -->
99
+ <div class="transcript-card" id="transcriptCard" style="display: none;">
100
+ <div class="transcript-header">
101
+ <span>Transcript</span>
102
+ <div class="transcript-actions">
103
+ <button id="copyBtn" class="icon-btn" title="Copy">
104
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
105
+ <path stroke-linecap="round" stroke-linejoin="round" d="M15.666 3.888A2.25 2.25 0 0013.5 2.25h-3c-1.03 0-1.9.693-2.166 1.638m7.332 0c.055.194.084.4.084.612v0a.75.75 0 01-.75.75H9.75a.75.75 0 01-.75-.75v0c0-.212.03-.418.084-.612m7.332 0c.646.049 1.288.11 1.927.184 1.1.128 1.907 1.077 1.907 2.185V19.5a2.25 2.25 0 01-2.25 2.25H6.75A2.25 2.25 0 014.5 19.5V6.257c0-1.108.806-2.057 1.907-2.185a48.208 48.208 0 011.927-.184" />
106
+ </svg>
107
+ </button>
108
+ <button id="downloadBtn" class="icon-btn" title="Download">
109
+ <svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor">
110
+ <path stroke-linecap="round" stroke-linejoin="round" d="M3 16.5v2.25A2.25 2.25 0 005.25 21h13.5A2.25 2.25 0 0021 18.75V16.5M16.5 12L12 16.5m0 0L7.5 12m4.5 4.5V3" />
111
+ </svg>
112
+ </button>
113
+ </div>
114
+ </div>
115
+ <div class="transcript-output" id="outputText"></div>
116
+ </div>
117
+
118
+ <!-- Footer -->
119
+ <div class="footer">
120
+ Made with
121
+ <a href="https://huggingface.co/ibm-granite/granite-4.0-1b-speech" target="_blank">Granite Speech 4.0 1B</a>
122
+ and
123
+ <a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank">ONNX Runtime Web</a>
124
+ <br>
125
+ <span class="privacy-note">Your audio and transcription never leave your device</span>
126
+ </div>
127
+ <div class="gpu-info" id="gpuInfo"></div>
128
+ </div>
129
+
130
+ <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.all.min.js"></script>
131
+ <script src="vad.js?v=1"></script>
132
+ <script src="punctuator.js?v=3"></script>
133
+ <script type="module" src="app.js?v=53"></script>
134
+ </body>
135
  </html>
pcs_vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
punct_cap_seg_en.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd922d459da618cd324280889740608b76fb3e9e61d3f402291be1251f91421b
3
+ size 209532928
punctuator.js ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Punctuation and Capitalization using ONNX
3
+ * - English: Full punctuation + capitalization (1-800-BAD-CODE model)
4
+ * - Other languages (DE, FR, IT, NL, ES, PT): Punctuation only (oliverguhr multilingual model)
5
+ */
6
+
7
+ // English model (punctuation + capitalization)
8
+ let pcsSession = null;
9
+ let pcsVocab = null;
10
+ let pcsVocabReverse = null;
11
+
12
+ // Multilingual model (punctuation only)
13
+ let multilingualSession = null;
14
+ let multilingualTokenizer = null;
15
+
16
+ const PCS_CONFIG = {
17
+ preLabels: ["<NULL>", "¿"],
18
+ postLabels: ["<NULL>", "<ACRONYM>", ".", ",", "?"],
19
+ unkId: 0,
20
+ bosId: 1,
21
+ eosId: 2,
22
+ padId: 3,
23
+ };
24
+
25
+ // Multilingual model label mapping
26
+ const MULTILINGUAL_LABELS = {
27
+ 0: "", // No punctuation
28
+ 1: ".", // Period
29
+ 2: ",", // Comma
30
+ 3: "?", // Question mark
31
+ 4: "-", // Hyphen
32
+ 5: ":", // Colon
33
+ };
34
+
35
+ // Languages supported by multilingual model
36
+ const MULTILINGUAL_LANGS = ['de', 'fr', 'it', 'nl', 'es', 'pt'];
37
+
38
+ // Load the English punctuator model and vocab
39
// Load that is currently in flight (or has completed). Sharing it makes
// concurrent callers wait for one download instead of each starting their
// own copy of the model.
let pcsLoadPromise = null;

/**
 * Load the English punctuation/capitalization model and its vocabulary.
 *
 * Safe to call repeatedly and concurrently: the work happens once. If a
 * load fails, the next call tries again.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the vocabulary cannot be fetched or the ONNX session
 *     cannot be created.
 */
async function loadEnglishPunctuator() {
    if (pcsSession) return;

    if (!pcsLoadPromise) {
        pcsLoadPromise = (async () => {
            console.log('Loading English punctuator model...');

            // Load vocab
            const vocabResponse = await fetch('./pcs_vocab.json');
            if (!vocabResponse.ok) {
                throw new Error(`Failed to fetch pcs_vocab.json: HTTP ${vocabResponse.status}`);
            }
            const vocabData = await vocabResponse.json();
            const vocab = vocabData.vocab;

            // Create reverse vocab (id -> piece)
            const reverseVocab = {};
            for (const [piece, id] of Object.entries(vocab)) {
                reverseVocab[id] = piece;
            }

            // Load ONNX model
            const session = await ort.InferenceSession.create('./punct_cap_seg_en.onnx', {
                executionProviders: ['wasm'],
            });

            // Publish everything together, session last, so other code
            // never observes a session without its vocabulary.
            pcsVocab = vocab;
            pcsVocabReverse = reverseVocab;
            pcsSession = session;

            console.log('English punctuator model loaded');
        })().catch((error) => {
            // Forget the failed attempt so a later call can retry.
            pcsLoadPromise = null;
            throw error;
        });
    }

    await pcsLoadPromise;
}
62
+
63
+ // Load the multilingual punctuator model
64
// Load that is currently in flight (or has completed); shared so that
// concurrent callers do not each download the tokenizer and model.
let multilingualLoadPromise = null;

/**
 * Load the multilingual punctuation model and its tokenizer.
 *
 * Safe to call repeatedly and concurrently: the work happens once. If a
 * load fails, the next call tries again.
 *
 * NOTE(review): './punct_multilingual_q8.onnx' is not among the files
 * added together with this script - confirm it is deployed next to the
 * page, otherwise this always rejects.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the tokenizer or the ONNX session cannot be loaded.
 */
async function loadMultilingualPunctuator() {
    if (multilingualSession) return;

    if (!multilingualLoadPromise) {
        multilingualLoadPromise = (async () => {
            console.log('Loading multilingual punctuator model...');

            // Load tokenizer from transformers.js
            const { AutoTokenizer } = await import('https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.4.2');
            const tokenizer = await AutoTokenizer.from_pretrained('oliverguhr/fullstop-punctuation-multilingual-base');

            // Load ONNX model
            const session = await ort.InferenceSession.create('./punct_multilingual_q8.onnx', {
                executionProviders: ['wasm'],
            });

            // Publish together, session last: the session doubles as the
            // "already loaded" flag checked at the top of this function.
            multilingualTokenizer = tokenizer;
            multilingualSession = session;

            console.log('Multilingual punctuator model loaded');
        })().catch((error) => {
            // Forget the failed attempt so a later call can retry.
            multilingualLoadPromise = null;
            throw error;
        });
    }

    await multilingualLoadPromise;
}
80
+
81
+ // Simple Unigram tokenizer for English model (greedy longest match)
82
/**
 * Tokenize text for the English model.
 *
 * This is a greedy longest-match approximation of a Unigram tokenizer:
 * the text is lower-cased, whitespace runs become a single '▁' word
 * marker (with one '▁' prepended for the first word), and at each position
 * the longest vocabulary piece of at most 20 UTF-16 units is taken.
 * Characters that match no piece become one UNK token each.
 *
 * Requires `pcsVocab` to be loaded.
 *
 * @param {string} text
 * @returns {number[]} Token ids, wrapped in BOS ... EOS.
 */
function tokenizeEnglish(text) {
    const maxPieceLength = 20;

    // Collapse every whitespace run (spaces, tabs, newlines) and drop
    // leading/trailing whitespace so stray blanks do not turn into empty
    // '▁' words or UNK tokens.
    const normalized = '▁' + text.toLowerCase().trim().replace(/\s+/g, '▁');

    const tokens = [PCS_CONFIG.bosId];

    // Walk the string with an index instead of re-slicing the remainder
    // after every token.
    let pos = 0;
    while (pos < normalized.length) {
        let consumed = 0;

        // Try longest match first
        const longest = Math.min(maxPieceLength, normalized.length - pos);
        for (let len = longest; len > 0; len--) {
            const piece = normalized.substring(pos, pos + len);
            // Own-property check: a plain lookup would also "find"
            // inherited keys such as "constructor" and push a function
            // into the token list.
            if (Object.hasOwn(pcsVocab, piece)) {
                tokens.push(pcsVocab[piece]);
                consumed = len;
                break;
            }
        }

        if (consumed === 0) {
            // Unknown character: emit UNK and skip it. Characters outside
            // the BMP occupy two UTF-16 units but are still one character.
            tokens.push(PCS_CONFIG.unkId);
            consumed = normalized.codePointAt(pos) > 0xffff ? 2 : 1;
        }

        pos += consumed;
    }

    tokens.push(PCS_CONFIG.eosId);

    return tokens;
}
117
+
118
+ // Apply punctuation and capitalization for English
119
/**
 * Punctuate, capitalize and sentence-split English text with the ONNX
 * model loaded by loadEnglishPunctuator().
 *
 * The model output is read as four per-token predictions:
 *   - pre_preds:  class index into PCS_CONFIG.preLabels (1 = '¿' before
 *                 the token)
 *   - post_preds: class index into PCS_CONFIG.postLabels (1 = acronym: a
 *                 '.' after every character; >1 = that mark after the
 *                 token's last character)
 *   - cap_preds:  one flag per character position of the token
 *   - seg_preds:  truthy when a sentence ends after the token
 *
 * @param {string} text - Unpunctuated text.
 * @returns {Promise<string>} Rebuilt text, sentences joined by one space.
 */
async function applyEnglishPunctuation(text) {
    await loadEnglishPunctuator();

    // Tokenize
    const tokenIds = tokenizeEnglish(text);

    // Run inference
    const inputTensor = new ort.Tensor('int64', BigInt64Array.from(tokenIds.map(BigInt)), [1, tokenIds.length]);
    const outputs = await pcsSession.run({ input_ids: inputTensor });

    const prePreds = outputs.pre_preds.data;
    const postPreds = outputs.post_preds.data;
    const capPreds = outputs.cap_preds.data;
    const segPreds = outputs.seg_preds.data;

    // cap_preds is [batch, seq, chars-per-token]; read the row width from
    // the tensor and fall back to 16 if the shape is not available.
    const capDims = outputs.cap_preds.dims;
    const capWidth = capDims && capDims.length === 3 ? Number(capDims[2]) : 16;

    // Decode: skip BOS (index 0) and EOS (last index)
    const numTokens = tokenIds.length - 2;
    const result = [];
    let currentSentence = [];

    for (let i = 0; i < numTokens; i++) {
        const tokenId = tokenIds[i + 1];
        const token = pcsVocabReverse[tokenId] || '';
        const outputIdx = i + 1;

        // int64 tensors are exposed as BigInt64Array, and `1n === 1` is
        // false, so normalise the class indices to plain numbers before
        // comparing. (Presumably these outputs are int64 - harmless if
        // they are already numbers.)
        const preLabel = Number(prePreds[outputIdx]);
        const postLabel = Number(postPreds[outputIdx]);

        // Handle word boundary
        if (token.startsWith('▁') && currentSentence.length > 0) {
            currentSentence.push(' ');
        }

        // Process each character in token ('▁' itself is not emitted, but
        // character positions still count it)
        const charStart = token.startsWith('▁') ? 1 : 0;
        for (let j = charStart; j < token.length; j++) {
            let char = token[j];

            // Pre-punctuation (e.g., inverted question mark)
            if (j === charStart && preLabel === 1) {
                currentSentence.push(PCS_CONFIG.preLabels[1]);
            }

            // Capitalization. Positions beyond the row width have no
            // prediction; reading them would hit the next token's row.
            if (j < capWidth && capPreds[outputIdx * capWidth + j]) {
                char = char.toUpperCase();
            }

            currentSentence.push(char);

            // Post-punctuation
            if (postLabel === 1) { // ACRONYM
                currentSentence.push('.');
            } else if (j === token.length - 1 && postLabel > 1) {
                currentSentence.push(PCS_CONFIG.postLabels[postLabel]);
            }
        }

        // Sentence boundary
        if (segPreds[outputIdx]) {
            result.push(currentSentence.join(''));
            currentSentence = [];
        }
    }

    if (currentSentence.length > 0) {
        result.push(currentSentence.join(''));
    }

    return result.join(' ');
}
189
+
190
+ // Apply punctuation only for other languages (multilingual model)
191
/**
 * Add punctuation (no capitalization) to text using the multilingual
 * token-classification model.
 *
 * The text is processed in chunks of at most 100 whitespace-separated
 * words so that long transcripts are not cut off at the model's 512-token
 * limit; chunk results are joined with a single space.
 *
 * @param {string} text - Unpunctuated text.
 * @returns {Promise<string>} Text rebuilt from the model's tokens with
 *     punctuation marks inserted.
 */
async function applyMultilingualPunctuation(text) {
    await loadMultilingualPunctuator();

    const wordsPerChunk = 100;
    const words = text.trim().split(/\s+/);
    const punctuatedChunks = [];
    for (let start = 0; start < words.length; start += wordsPerChunk) {
        const chunkText = words.slice(start, start + wordsPerChunk).join(' ');
        punctuatedChunks.push(await punctuateMultilingualChunk(chunkText));
    }

    return punctuatedChunks.join(' ');
}

// Runs the multilingual model on one chunk of text and rebuilds the chunk
// with punctuation. Expects the model and tokenizer to be loaded.
async function punctuateMultilingualChunk(chunkText) {
    const maxTokens = 512;
    const fallbackNumLabels = 6;
    const specialTokens = ['<s>', '</s>', '<pad>'];

    // Tokenize using transformers.js tokenizer. The option is spelled
    // `return_tensor` (singular); with it set to false the ids come back
    // as plain arrays, which the code below relies on.
    const encoded = await multilingualTokenizer(chunkText, {
        return_tensor: false,
        padding: false,
        truncation: true,
        max_length: maxTokens,
    });

    const inputIds = Array.from(encoded.input_ids, Number);
    const attentionMask = Array.from(encoded.attention_mask, Number);

    if (inputIds.length >= maxTokens) {
        console.warn('Multilingual punctuation: chunk reached the token limit; trailing text may be missing');
    }

    // Run inference
    const inputIdsTensor = new ort.Tensor('int64', BigInt64Array.from(inputIds.map(BigInt)), [1, inputIds.length]);
    const attentionMaskTensor = new ort.Tensor('int64', BigInt64Array.from(attentionMask.map(BigInt)), [1, attentionMask.length]);

    const outputs = await multilingualSession.run({
        input_ids: inputIdsTensor,
        attention_mask: attentionMaskTensor,
    });

    const logits = outputs.logits.data;
    // logits is [batch, seq, labels]; take the label count from the shape.
    const logitDims = outputs.logits.dims;
    const numLabels = logitDims && logitDims.length > 0
        ? Number(logitDims[logitDims.length - 1])
        : fallbackNumLabels;

    // Get predictions (argmax over logits)
    const predictions = [];
    for (let i = 0; i < inputIds.length; i++) {
        let maxIdx = 0;
        let maxVal = logits[i * numLabels];
        for (let j = 1; j < numLabels; j++) {
            if (logits[i * numLabels + j] > maxVal) {
                maxVal = logits[i * numLabels + j];
                maxIdx = j;
            }
        }
        predictions.push(maxIdx);
    }

    // Decode tokens back to text with punctuation
    const tokens = multilingualTokenizer.model.convert_ids_to_tokens(inputIds);
    const result = [];

    for (let i = 0; i < tokens.length; i++) {
        const token = tokens[i];

        // Skip special tokens
        if (specialTokens.includes(token)) {
            continue;
        }

        // Handle subword tokens (▁ prefix indicates start of new word)
        if (token.startsWith('▁')) {
            if (result.length > 0) {
                result.push(' ');
            }
            result.push(token.substring(1));
        } else {
            result.push(token);
        }

        // Only the last subword of a word may carry punctuation; a mark
        // predicted on an earlier piece would land inside the word.
        const next = tokens[i + 1];
        const endsWord = next === undefined || specialTokens.includes(next) || next.startsWith('▁');
        if (!endsWord) {
            continue;
        }

        const punct = MULTILINGUAL_LABELS[predictions[i]];
        if (punct) {
            result.push(punct);
        }
    }

    return result.join('');
}
262
+
263
+ // Main entry point - routes to appropriate model based on language
264
/**
 * Main entry point: punctuate `text` with the model that matches `lang`.
 *
 * Languages listed in MULTILINGUAL_LANGS go to the multilingual model;
 * everything else (including no language) goes to the English model.
 * Empty or whitespace-only input is returned unchanged, and if the chosen
 * model fails the original text is returned after logging a warning.
 *
 * @param {string} text
 * @param {?string} [lang=null] - Language code such as 'de'.
 * @returns {Promise<string>}
 */
async function applyPunctuation(text, lang = null) {
    const hasContent = Boolean(text) && text.trim().length !== 0;
    if (!hasContent) {
        return text;
    }

    const useMultilingual = Boolean(lang) && MULTILINGUAL_LANGS.includes(lang);
    const punctuate = useMultilingual ? applyMultilingualPunctuation : applyEnglishPunctuation;
    const failureMessage = useMultilingual
        ? 'Multilingual punctuation failed, returning original:'
        : 'English punctuation failed, returning original:';

    try {
        return await punctuate(text);
    } catch (error) {
        console.warn(failureMessage, error);
        return text;
    }
}
285
+
286
+ // Preload English model (called during init)
287
/**
 * Preload hook: loads only the English model. The multilingual model is
 * loaded on demand by applyMultilingualPunctuation().
 *
 * @returns {Promise<void>}
 */
async function loadPunctuator() {
    const englishReady = loadEnglishPunctuator();
    await englishReady;
}

// This file is loaded as a classic script while app.js is a module, so
// the API is handed over through properties on `window`.
Object.assign(window, {
    applyPunctuation,
    loadPunctuator,
    MULTILINGUAL_PUNCT_LANGS: MULTILINGUAL_LANGS,
});
silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
3
+ size 2243022
style.css CHANGED
@@ -1,28 +1,638 @@
 
 
 
 
 
 
 
 
1
  body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  }
5
 
 
6
  h1 {
7
- font-size: 16px;
8
- margin-top: 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  }
10
 
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
  }
17
 
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
 
 
 
 
24
  }
25
 
26
- .card p:last-child {
27
- margin-bottom: 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  }
 
1
+ /* Granite Speech WebGPU - Whisper Web Style */
2
+
3
+ * {
4
+ box-sizing: border-box;
5
+ margin: 0;
6
+ padding: 0;
7
+ }
8
+
9
  body {
10
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
11
+ background: #f8fafc;
12
+ color: #0f172a;
13
+ min-height: 100vh;
14
+ display: flex;
15
+ justify-content: center;
16
+ align-items: center;
17
+ padding: 2rem;
18
+ line-height: 1.5;
19
+ }
20
+
21
+ .container {
22
+ width: 100%;
23
+ max-width: 540px;
24
+ display: flex;
25
+ flex-direction: column;
26
+ align-items: center;
27
+ gap: 1.5rem;
28
  }
29
 
30
+ /* Header */
31
  h1 {
32
+ font-size: 3rem;
33
+ font-weight: 800;
34
+ letter-spacing: -0.025em;
35
+ color: #0f172a;
36
+ text-align: center;
37
+ }
38
+
39
+ h2 {
40
+ font-size: 1.25rem;
41
+ font-weight: 600;
42
+ letter-spacing: -0.015em;
43
+ color: #0f172a;
44
+ text-align: center;
45
+ padding: 0 1rem;
46
+ white-space: nowrap;
47
+ }
48
+
49
+ .privacy-note {
50
+ font-size: 0.8125rem;
51
+ color: #64748b;
52
+ text-align: center;
53
+ }
54
+
55
+ /* Input Card */
56
+ .input-card {
57
+ width: 100%;
58
+ background: white;
59
+ border-radius: 0.5rem;
60
+ box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.05);
61
+ border: 1px solid rgba(51, 65, 85, 0.1);
62
+ overflow: hidden;
63
+ }
64
+
65
+ .input-options {
66
+ display: flex;
67
+ align-items: stretch;
68
+ padding: 0.5rem;
69
+ gap: 0.5rem;
70
+ }
71
+
72
+ .input-tile {
73
+ display: flex;
74
+ align-items: center;
75
+ justify-content: center;
76
+ gap: 0.5rem;
77
+ padding: 0.5rem 0.75rem;
78
+ background: transparent;
79
+ border: none;
80
+ border-radius: 0.5rem;
81
+ color: #64748b;
82
+ font-size: 0.9375rem;
83
+ cursor: pointer;
84
+ transition: all 0.2s;
85
+ flex: 1;
86
+ }
87
+
88
+ .input-tile:hover:not(:disabled) {
89
+ color: #4f46e5;
90
+ background: #eef2ff;
91
+ }
92
+
93
+ .input-tile.recording {
94
+ color: #ef4444;
95
+ background: #fef2f2;
96
+ }
97
+
98
+ .input-tile.recording:hover {
99
+ color: #dc2626;
100
+ background: #fee2e2;
101
+ }
102
+
103
+ .input-tile:disabled {
104
+ opacity: 0.5;
105
+ cursor: not-allowed;
106
+ }
107
+
108
+ .input-tile svg {
109
+ width: 1.75rem;
110
+ height: 1.75rem;
111
+ flex-shrink: 0;
112
+ }
113
+
114
+ .file-label {
115
+ cursor: pointer;
116
+ transition: all 0.2s;
117
+ }
118
+
119
+ .file-label.drag-over {
120
+ color: #2563eb;
121
+ background: #eff6ff;
122
+ }
123
+
124
+ .input-card.drag-over {
125
+ border: 2px dashed #2563eb;
126
+ background: #eff6ff;
127
+ }
128
+
129
+ .divider {
130
+ width: 1px;
131
+ background: #e2e8f0;
132
+ margin: 0.25rem 0;
133
+ }
134
+
135
+ /* Progress Bar */
136
+ .progress-bar {
137
+ width: 100%;
138
+ height: 4px;
139
+ background: #e5e7eb;
140
+ }
141
+
142
+ .progress-fill {
143
+ height: 100%;
144
+ background: #2563eb;
145
+ width: 0%;
146
+ transition: width 0.1s;
147
+ border-radius: 0 2px 2px 0;
148
+ }
149
+
150
+ /* Status Section */
151
+ .status-section {
152
+ display: flex;
153
+ align-items: center;
154
+ gap: 0.5rem;
155
+ font-size: 0.875rem;
156
+ color: #64748b;
157
+ }
158
+
159
+ .status-dot {
160
+ width: 8px;
161
+ height: 8px;
162
+ border-radius: 50%;
163
+ background: #94a3b8;
164
+ }
165
+
166
+ .status-dot.ready {
167
+ background: #22c55e;
168
+ }
169
+
170
+ .status-dot.loading,
171
+ .status-dot.processing {
172
+ background: #f59e0b;
173
+ animation: pulse 1.5s infinite;
174
+ }
175
+
176
+ .status-dot.recording {
177
+ background: #ef4444;
178
+ animation: pulse 0.8s infinite;
179
+ }
180
+
181
+ .status-dot.error {
182
+ background: #ef4444;
183
+ }
184
+
185
+ @keyframes pulse {
186
+ 0%, 100% { opacity: 1; }
187
+ 50% { opacity: 0.5; }
188
+ }
189
+
190
+ /* Audio Player */
191
+ .audio-player {
192
+ width: 100%;
193
+ display: flex;
194
+ align-items: center;
195
+ gap: 0.5rem;
196
+ padding: 0.25rem 0.5rem;
197
+ background: white;
198
+ border-radius: 0.5rem;
199
+ border: 1px solid rgba(51, 65, 85, 0.1);
200
+ }
201
+
202
+ .play-btn {
203
+ width: 24px;
204
+ height: 24px;
205
+ border-radius: 50%;
206
+ border: none;
207
+ background: #2563eb;
208
+ color: white;
209
+ cursor: pointer;
210
+ display: flex;
211
+ align-items: center;
212
+ justify-content: center;
213
+ flex-shrink: 0;
214
+ transition: background 0.2s;
215
+ }
216
+
217
+ .play-btn:hover {
218
+ background: #1d4ed8;
219
+ }
220
+
221
+ .play-btn svg {
222
+ width: 14px;
223
+ height: 14px;
224
+ }
225
+
226
+ .waveform-container {
227
+ flex: 1;
228
+ height: 24px;
229
+ position: relative;
230
+ cursor: pointer;
231
+ border-radius: 4px;
232
+ overflow: hidden;
233
+ }
234
+
235
+ #waveformCanvas {
236
+ width: 100%;
237
+ height: 100%;
238
+ display: block;
239
+ }
240
+
241
+ .waveform-progress {
242
+ position: absolute;
243
+ top: 0;
244
+ left: 0;
245
+ height: 100%;
246
+ width: 0%;
247
+ background: rgba(37, 99, 235, 0.3);
248
+ pointer-events: none;
249
+ }
250
+
251
+ .audio-time {
252
+ font-size: 0.8125rem;
253
+ color: #64748b;
254
+ font-variant-numeric: tabular-nums;
255
+ min-width: 3rem;
256
+ text-align: right;
257
+ }
258
+
259
+ .clear-btn {
260
+ width: 20px;
261
+ height: 20px;
262
+ border-radius: 50%;
263
+ border: none;
264
+ background: transparent;
265
+ color: #94a3b8;
266
+ cursor: pointer;
267
+ display: flex;
268
+ align-items: center;
269
+ justify-content: center;
270
+ flex-shrink: 0;
271
+ padding: 0;
272
+ transition: all 0.2s;
273
+ }
274
+
275
+ .clear-btn:hover {
276
+ color: #ef4444;
277
+ background: #fef2f2;
278
+ }
279
+
280
+ .clear-btn svg {
281
+ width: 14px;
282
+ height: 14px;
283
+ }
284
+
285
+ /* Transcribe Section */
286
+ .transcribe-section {
287
+ width: 100%;
288
+ display: flex;
289
+ flex-direction: column;
290
+ align-items: center;
291
+ gap: 1rem;
292
+ }
293
+
294
+ .task-row {
295
+ display: flex;
296
+ align-items: center;
297
+ gap: 0.75rem;
298
+ font-size: 0.875rem;
299
+ color: #64748b;
300
+ }
301
+
302
+ .task-row select {
303
+ padding: 0.375rem 0.75rem;
304
+ font-size: 0.875rem;
305
+ background: white;
306
+ color: #0f172a;
307
+ border: 1px solid #d1d5db;
308
+ border-radius: 0.5rem;
309
+ cursor: pointer;
310
+ outline: none;
311
  }
312
 
313
+ .task-row select:focus {
314
+ border-color: #2563eb;
315
+ box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2);
 
 
316
  }
317
 
318
+ .transcribe-btn {
319
+ padding: 0.625rem 2.5rem;
320
+ font-size: 0.9375rem;
321
+ font-weight: 500;
322
+ background: #2563eb;
323
+ color: white;
324
+ border: none;
325
+ border-radius: 0.5rem;
326
+ cursor: pointer;
327
+ transition: background 0.2s;
328
  }
329
 
330
+ .transcribe-btn:hover:not(:disabled) {
331
+ background: #1d4ed8;
332
+ }
333
+
334
+ .transcribe-btn:disabled {
335
+ opacity: 0.5;
336
+ cursor: not-allowed;
337
+ }
338
+
339
+ .checkbox-row {
340
+ display: flex;
341
+ align-items: center;
342
+ gap: 0.5rem;
343
+ font-size: 0.875rem;
344
+ color: #64748b;
345
+ cursor: pointer;
346
+ }
347
+
348
+ .checkbox-row input[type="checkbox"] {
349
+ width: 1rem;
350
+ height: 1rem;
351
+ cursor: pointer;
352
+ accent-color: #2563eb;
353
+ }
354
+
355
+ /* Model Progress */
356
+ .model-progress {
357
+ font-size: 0.875rem;
358
+ color: #64748b;
359
+ }
360
+
361
+ /* Transcript Card */
362
+ .transcript-card {
363
+ width: 100%;
364
+ background: white;
365
+ border-radius: 0.5rem;
366
+ box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.05);
367
+ border: 1px solid rgba(51, 65, 85, 0.1);
368
+ overflow: hidden;
369
+ }
370
+
371
+ .transcript-header {
372
+ display: flex;
373
+ justify-content: space-between;
374
+ align-items: center;
375
+ padding: 0.75rem 1rem;
376
+ border-bottom: 1px solid #e2e8f0;
377
+ font-size: 0.875rem;
378
+ font-weight: 500;
379
+ color: #64748b;
380
+ }
381
+
382
+ .transcript-actions {
383
+ display: flex;
384
+ gap: 0.25rem;
385
+ }
386
+
387
+ .icon-btn {
388
+ display: flex;
389
+ align-items: center;
390
+ justify-content: center;
391
+ width: 32px;
392
+ height: 32px;
393
+ background: transparent;
394
+ border: none;
395
+ border-radius: 0.375rem;
396
+ color: #64748b;
397
+ cursor: pointer;
398
+ transition: all 0.2s;
399
+ }
400
+
401
+ .icon-btn:hover {
402
+ background: #f1f5f9;
403
+ color: #0f172a;
404
+ }
405
+
406
+ .icon-btn svg {
407
+ width: 18px;
408
+ height: 18px;
409
+ }
410
+
411
/* Scrollable transcript body: capped height, long tokens wrap instead of overflowing. */
.transcript-output {
  padding: 1rem;
  min-height: 100px;
  max-height: 300px;
  overflow-y: auto;
  font-size: 0.9375rem;
  line-height: 1.7;
  color: #0f172a;
  /* `overflow-wrap` is the standard property name; `word-wrap` is only its legacy alias. */
  overflow-wrap: break-word;
}
421
+
422
+ .transcript-row {
423
+ display: flex;
424
+ gap: 0.75rem;
425
+ padding: 0.25rem 0;
426
+ }
427
+
428
+ .transcript-row .timestamp {
429
+ font-size: 0.9375rem;
430
+ color: #94a3b8;
431
+ flex-shrink: 0;
432
+ min-width: 3rem;
433
+ text-align: right;
434
+ font-variant-numeric: tabular-nums;
435
+ }
436
+
437
+ .transcript-row .transcript-text {
438
+ flex: 1;
439
+ }
440
+
441
+ /* Footer */
442
+ .footer {
443
+ text-align: center;
444
+ font-size: 0.875rem;
445
+ color: #64748b;
446
+ }
447
+
448
+ .footer a {
449
+ color: #2563eb;
450
+ text-decoration: none;
451
+ }
452
+
453
+ .footer a:hover {
454
+ text-decoration: underline;
455
+ }
456
+
457
+ .gpu-info {
458
+ font-size: 0.75rem;
459
+ font-family: 'SF Mono', Monaco, 'Courier New', monospace;
460
+ color: #94a3b8;
461
+ }
462
+
463
+ /* Responsive */
464
+ @media (max-width: 640px) {
465
+ body {
466
+ padding: 1rem;
467
+ }
468
+
469
+ h1 {
470
+ font-size: 2.25rem;
471
+ }
472
+
473
+ h2 {
474
+ font-size: 1rem;
475
+ }
476
+
477
+ .input-tile {
478
+ padding: 0.5rem;
479
+ font-size: 0.8125rem;
480
+ }
481
+
482
+ .input-tile svg {
483
+ width: 1.5rem;
484
+ height: 1.5rem;
485
+ }
486
+ }
487
+
488
+ /* Scrollbar */
489
+ .transcript-output::-webkit-scrollbar {
490
+ width: 6px;
491
+ }
492
+
493
+ .transcript-output::-webkit-scrollbar-track {
494
+ background: #f1f5f9;
495
+ }
496
+
497
+ .transcript-output::-webkit-scrollbar-thumb {
498
+ background: #cbd5e1;
499
+ border-radius: 3px;
500
+ }
501
+
502
+ .transcript-output::-webkit-scrollbar-thumb:hover {
503
+ background: #94a3b8;
504
+ }
505
+
506
+ /* Dark Mode */
507
+ @media (prefers-color-scheme: dark) {
508
+ body {
509
+ background: #0f172a;
510
+ color: #e2e8f0;
511
+ }
512
+
513
+ h1, h2 {
514
+ color: #f1f5f9;
515
+ }
516
+
517
+ .input-card,
518
+ .transcript-card,
519
+ .audio-player {
520
+ background: #1e293b;
521
+ border-color: rgba(148, 163, 184, 0.1);
522
+ box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.3);
523
+ }
524
+
525
+ .audio-time {
526
+ color: #94a3b8;
527
+ }
528
+
529
+ .input-card.drag-over {
530
+ background: #1e3a5f;
531
+ border-color: #3b82f6;
532
+ }
533
+
534
+ .input-tile {
535
+ color: #94a3b8;
536
+ }
537
+
538
+ .input-tile:hover:not(:disabled) {
539
+ color: #818cf8;
540
+ background: #312e81;
541
+ }
542
+
543
+ .input-tile.recording {
544
+ color: #f87171;
545
+ background: #450a0a;
546
+ }
547
+
548
+ .input-tile.recording:hover {
549
+ color: #fca5a5;
550
+ background: #7f1d1d;
551
+ }
552
+
553
+ .divider {
554
+ background: #334155;
555
+ }
556
+
557
+ .progress-bar {
558
+ background: #334155;
559
+ }
560
+
561
+ .status-section,
562
+ .task-row,
563
+ .checkbox-row,
564
+ .model-progress {
565
+ color: #94a3b8;
566
+ }
567
+
568
+ .task-row select {
569
+ background: #1e293b;
570
+ color: #e2e8f0;
571
+ border-color: #475569;
572
+ }
573
+
574
+ .task-row select:focus {
575
+ border-color: #3b82f6;
576
+ box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.3);
577
+ }
578
+
579
+ .transcript-header {
580
+ border-color: #334155;
581
+ color: #94a3b8;
582
+ }
583
+
584
+ .transcript-output {
585
+ color: #e2e8f0;
586
+ }
587
+
588
+ .transcript-row .timestamp {
589
+ color: #64748b;
590
+ }
591
+
592
+ .icon-btn {
593
+ color: #94a3b8;
594
+ }
595
+
596
+ .icon-btn:hover {
597
+ background: #334155;
598
+ color: #f1f5f9;
599
+ }
600
+
601
+ .clear-btn {
602
+ color: #64748b;
603
+ }
604
+
605
+ .clear-btn:hover {
606
+ color: #f87171;
607
+ background: #450a0a;
608
+ }
609
+
610
+ .footer {
611
+ color: #64748b;
612
+ }
613
+
614
+ .footer a {
615
+ color: #60a5fa;
616
+ }
617
+
618
+ .privacy-note {
619
+ color: #64748b;
620
+ }
621
+
622
+ .gpu-info {
623
+ color: #64748b;
624
+ }
625
+
626
+ /* Scrollbar dark */
627
+ .transcript-output::-webkit-scrollbar-track {
628
+ background: #1e293b;
629
+ }
630
+
631
+ .transcript-output::-webkit-scrollbar-thumb {
632
+ background: #475569;
633
+ }
634
+
635
+ .transcript-output::-webkit-scrollbar-thumb:hover {
636
+ background: #64748b;
637
+ }
638
  }
vad.js ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Silero VAD for speech detection and silence trimming
3
+ * Based on the approach in ibm-granite/granite-speech HF demo
4
+ */
5
+
6
+ let vadSession = null;
7
+ const VAD_SAMPLE_RATE = 16000;
8
+ const VAD_CHUNK_SIZE = 512; // 32ms chunks at 16kHz
9
+
10
// Promise of the load currently in flight (null when idle). Shared by
// concurrent callers so the model is only created once.
let vadLoadPromise = null;

/**
 * Load the Silero VAD ONNX model into `vadSession` (WASM execution provider).
 *
 * Safe to call repeatedly and concurrently: once a session exists this is a
 * no-op, and callers that arrive while a load is still in progress wait for
 * that load instead of creating a second session.
 *
 * @returns {Promise<void>} Resolves once `vadSession` is set; rejects with the
 *   error from `ort.InferenceSession.create` if the load fails.
 */
async function loadVAD() {
  if (vadSession) return;

  if (vadLoadPromise) {
    // Another caller already started the load; just wait for it.
    await vadLoadPromise;
    return;
  }

  console.log('Loading VAD model...');
  vadLoadPromise = ort.InferenceSession.create('./silero_vad.onnx', {
    executionProviders: ['wasm'],
  });
  try {
    vadSession = await vadLoadPromise;
  } finally {
    // Clear the marker on success and on failure so a failed load can be retried.
    vadLoadPromise = null;
  }
  console.log('VAD model loaded');
}
20
+
21
/**
 * Run Silero VAD over the audio and return the detected speech regions.
 *
 * The audio is fed to the model in consecutive frames of VAD_CHUNK_SIZE
 * samples (the final frame is zero-padded), passing the state returned by one
 * frame into the next. Runs of consecutive frames whose speech probability is
 * at or above `threshold` are collapsed into a single region.
 *
 * @param {Float32Array|number[]} audioData - Audio samples. Presumably mono at
 *   VAD_SAMPLE_RATE; this function does not check or resample.
 * @param {number} [threshold=0.5] - Minimum probability for a frame to count as speech.
 * @returns {Promise<Array<{start: number, end: number}>>} Regions as sample
 *   offsets into `audioData`; `start` is inclusive, `end` exclusive. Empty when
 *   no frame reaches the threshold (or the audio is empty).
 */
async function getSpeechTimestamps(audioData, threshold = 0.5) {
  await loadVAD();

  // Model state, shape [2, 1, 128]; starts zeroed and is replaced after every frame.
  let state = new Float32Array(2 * 1 * 128);

  // The sample-rate input is the same for every frame, so build its scalar
  // tensor once instead of once per chunk.
  const srTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(VAD_SAMPLE_RATE)]), []);

  const segments = [];
  let inSpeech = false;
  let speechStart = 0;

  for (let offset = 0; offset < audioData.length; offset += VAD_CHUNK_SIZE) {
    const chunkEnd = Math.min(offset + VAD_CHUNK_SIZE, audioData.length);

    // A fresh zero-filled frame, so a short final chunk is padded with silence.
    const chunk = new Float32Array(VAD_CHUNK_SIZE);
    chunk.set(
      typeof audioData.subarray === 'function'
        ? audioData.subarray(offset, chunkEnd)
        : audioData.slice(offset, chunkEnd),
    );

    const outputs = await vadSession.run({
      input: new ort.Tensor('float32', chunk, [1, VAD_CHUNK_SIZE]),
      state: new ort.Tensor('float32', state, [2, 1, 128]),
      sr: srTensor,
    });

    const isSpeech = outputs.output.data[0] >= threshold;
    // Copy the returned state rather than holding on to the output buffer.
    state = new Float32Array(outputs.stateN.data);

    if (isSpeech && !inSpeech) {
      speechStart = offset;
      inSpeech = true;
    } else if (!isSpeech && inSpeech) {
      segments.push({ start: speechStart, end: offset });
      inSpeech = false;
    }
  }

  // Speech that runs to the end of the audio is closed at the last sample.
  if (inSpeech) {
    segments.push({ start: speechStart, end: audioData.length });
  }

  return segments;
}
86
+
87
/**
 * Turn raw VAD regions into contiguous transcription segments (in seconds).
 *
 * The first region is padded backwards by a short lead-in. Each later region
 * is either merged into the previous segment (when the gap to its padded start
 * is small) or starts a new segment exactly where the previous one ended, so
 * the returned segments never leave a gap between them.
 *
 * @param {Float32Array|number[]} audioData - Audio samples passed on to the VAD.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - Used to convert sample offsets to seconds.
 * @returns {Promise<Array<{start: number, end: number}>>} Segments in seconds;
 *   a single segment spanning the whole audio when the VAD finds no speech.
 */
async function getSpeechSegments(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const LEAD_IN_SEC = 0.3; // padding applied before a region's start
  const MERGE_GAP_SEC = 0.5; // gaps shorter than this are merged away

  const rawRegions = await getSpeechTimestamps(audioData);
  if (rawRegions.length === 0) {
    return [{ start: 0, end: audioData.length / sampleRate }];
  }

  return rawRegions.reduce((merged, region) => {
    const regionEnd = region.end / sampleRate;
    const paddedStart = Math.max(0, region.start / sampleRate - LEAD_IN_SEC);
    const previous = merged[merged.length - 1];

    if (previous === undefined) {
      // Very first segment: keeps its padded start.
      merged.push({ start: paddedStart, end: regionEnd });
    } else if (paddedStart - previous.end < MERGE_GAP_SEC) {
      // Close enough to the previous segment: extend it.
      previous.end = regionEnd;
    } else {
      // New segment begins where the previous one stopped, leaving no gap.
      merged.push({ start: previous.end, end: regionEnd });
    }
    return merged;
  }, []);
}
123
+
124
/**
 * Cut leading and trailing silence from the audio.
 *
 * Only the outer edges are trimmed: everything between the first and the last
 * detected speech region is kept, plus 300 ms of padding on each side
 * (clamped to the bounds of the audio).
 *
 * @param {Float32Array|number[]} audioData - Audio samples.
 * @param {number} [sampleRate=VAD_SAMPLE_RATE] - Used for the padding size and the log output.
 * @returns {Promise<Float32Array|number[]>} A slice of `audioData`, or
 *   `audioData` itself when no speech is detected.
 */
async function trimSilence(audioData, sampleRate = VAD_SAMPLE_RATE) {
  const speech = await getSpeechTimestamps(audioData);

  if (speech.length === 0) {
    console.log('VAD: No speech detected, returning original audio');
    return audioData;
  }

  const { start: firstSpeech } = speech[0];
  const { end: lastSpeech } = speech[speech.length - 1];

  // 300 ms of context on both sides of the speech.
  const pad = Math.floor(0.3 * sampleRate);
  const keepFrom = Math.max(0, firstSpeech - pad);
  const keepTo = Math.min(audioData.length, lastSpeech + pad);

  const trimmedStart = (keepFrom / sampleRate).toFixed(2);
  const trimmedEnd = ((audioData.length - keepTo) / sampleRate).toFixed(2);
  console.log(`VAD: Trimmed ${trimmedStart}s from start, ${trimmedEnd}s from end`);

  return audioData.slice(keepFrom, keepTo);
}
145
+
146
/**
 * Format a duration in seconds as `M:SS` (minutes are not capped at 59 and
 * are not zero-padded; fractional seconds are truncated).
 *
 * Negative and non-finite inputs (NaN, Infinity) are rendered as `0:00`
 * instead of strings such as `-1:-1` or `NaN:NaN`.
 *
 * @param {number} seconds - Duration in seconds.
 * @returns {string} The formatted timestamp, e.g. `1:05`.
 */
function formatTimestamp(seconds) {
  // Clamp anything that cannot be shown as a clock value to zero.
  const safeSeconds = Number.isFinite(seconds) && seconds > 0 ? seconds : 0;
  const mins = Math.floor(safeSeconds / 60);
  const secs = Math.floor(safeSeconds % 60);
  return `${mins}:${secs.toString().padStart(2, '0')}`;
}
152
+
153
+ // Export
154
+ window.loadVAD = loadVAD;
155
+ window.trimSilence = trimSilence;
156
+ window.getSpeechSegments = getSpeechSegments;
157
+ window.formatTimestamp = formatTimestamp;