Spaces:
Running
Running
| import { | |
| AutoProcessor, | |
| AutoModelForImageTextToText, | |
| RawImage, | |
| TextStreamer, | |
| } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2'; | |
// --- Module-level state ---
let processor = null;        // AutoProcessor instance, lazily created by initializeModel()
let model = null;            // AutoModelForImageTextToText instance, lazily created by initializeModel()
let currentVideo = null;     // File object for the currently selected video (null until chosen)
let frameDescriptions = [];  // [{ frame, time, description }] accumulated by processVideo()
// Check WebGPU support, update the status badge, and lock the device
// selector to WASM when WebGPU is unusable.
// Returns true only when a real WebGPU adapter is obtainable.
async function checkWebGPUSupport() {
    const statusEl = document.getElementById('webgpuStatus');
    // 'gpu' in navigator only proves the API surface exists; requestAdapter()
    // can still resolve to null (blocklisted GPU, headless, etc.), so probe
    // for an actual adapter before reporting support.
    let supported = false;
    if ('gpu' in navigator) {
        try {
            supported = (await navigator.gpu.requestAdapter()) !== null;
        } catch {
            supported = false;
        }
    }
    if (supported) {
        statusEl.textContent = '✅ Available';
        statusEl.style.color = '#10b981';
        return true;
    }
    statusEl.textContent = '❌ Not Available';
    statusEl.style.color = '#ef4444';
    document.getElementById('deviceSelect').value = 'wasm';
    document.getElementById('deviceSelect').disabled = true;
    return false;
}
// Load the FastVLM processor and model for the device selected in the UI.
// Populates the module-level `processor` and `model`.
// Returns true on success, false (after surfacing an error) on failure.
async function initializeModel() {
    const selectedDevice = document.getElementById('deviceSelect').value;
    updateStatus('Loading AI model...');
    try {
        const modelId = "onnx-community/FastVLM-0.5B-ONNX";
        processor = await AutoProcessor.from_pretrained(modelId);
        // Quantization: fp16 token embeddings, 4-bit vision encoder + decoder.
        const loadOptions = {
            dtype: {
                embed_tokens: "fp16",
                vision_encoder: "q4",
                decoder_model_merged: "q4",
            },
        };
        if (selectedDevice === 'webgpu') {
            loadOptions.device = 'webgpu';
        }
        model = await AutoModelForImageTextToText.from_pretrained(modelId, loadOptions);
        updateStatus('Model loaded successfully!');
        return true;
    } catch (error) {
        console.error('Model initialization error:', error);
        showError('Failed to load AI model. Please try again.');
        return false;
    }
}
// Extract `numFrames` evenly spaced frames from a video File.
// Resolves to [{ image: RawImage, preview: dataURL string, time: seconds }].
// Fix: errors thrown inside the async 'loadedmetadata' handler (seek/draw/
// RawImage failures) previously escaped the executor and left the returned
// promise pending forever; they now reject it, and the object URL is always
// released.
async function extractFramesFromVideo(videoFile, numFrames = 4) {
    return new Promise((resolve, reject) => {
        const video = document.createElement('video');
        const canvas = document.createElement('canvas');
        const ctx = canvas.getContext('2d');
        video.src = URL.createObjectURL(videoFile);
        video.addEventListener('loadedmetadata', async () => {
            try {
                const duration = video.duration;
                const frameInterval = duration / numFrames;
                const frames = [];
                // Cap dimensions so very large videos don't blow up memory
                // or inference time.
                canvas.width = Math.min(video.videoWidth, 1024);
                canvas.height = Math.min(video.videoHeight, 1024);
                for (let i = 0; i < numFrames; i++) {
                    const currentTime = i * frameInterval;
                    video.currentTime = currentTime;
                    // Wait until the seek completes before drawing.
                    await new Promise(r => {
                        video.addEventListener('seeked', () => r(), { once: true });
                    });
                    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
                    const blob = await new Promise(r => canvas.toBlob(r, 'image/png'));
                    if (!blob) {
                        // toBlob yields null on failure (e.g. tainted canvas).
                        throw new Error('Failed to capture frame');
                    }
                    const image = await RawImage.fromBlob(blob);
                    // Store frame preview (JPEG keeps the data URL small).
                    const previewUrl = canvas.toDataURL('image/jpeg', 0.8);
                    frames.push({ image, preview: previewUrl, time: currentTime });
                    // Frame extraction occupies the first 30% of the progress bar.
                    updateProgress((i + 1) / numFrames * 30, `Extracting frame ${i + 1}/${numFrames}`);
                }
                resolve(frames);
            } catch (err) {
                reject(err);
            } finally {
                URL.revokeObjectURL(video.src);
            }
        });
        video.addEventListener('error', () => {
            URL.revokeObjectURL(video.src);
            reject(new Error('Failed to load video'));
        });
    });
}
// Orchestrate the full analysis pipeline: lazily load the model, extract
// frames, caption each frame with the VLM, then generate an overall summary.
// Drives the progress/result UI and always restores the analyze button.
// Fix: streamed model output was injected with innerHTML (XSS risk since the
// generated text is untrusted); it is now rendered via textContent.
async function processVideo() {
    const analyzeBtn = document.getElementById('analyzeBtn');
    const progressSection = document.getElementById('progressSection');
    const resultsSection = document.getElementById('resultsSection');
    analyzeBtn.disabled = true;
    analyzeBtn.querySelector('.spinner').classList.remove('hidden');
    analyzeBtn.querySelector('.btn-text').textContent = 'Processing...';
    progressSection.classList.remove('hidden');
    resultsSection.classList.add('hidden');
    frameDescriptions = [];
    try {
        // Initialize model if not already loaded
        if (!model || !processor) {
            if (!await initializeModel()) {
                throw new Error('Model initialization failed');
            }
        }
        // Extract frames (0-30% of the progress bar)
        const numFrames = parseInt(document.getElementById('frameCount').value, 10);
        updateProgress(0, 'Extracting frames from video...');
        const frames = await extractFramesFromVideo(currentVideo, numFrames);
        // Display frame previews
        const framesGrid = document.getElementById('framesGrid');
        framesGrid.innerHTML = '';
        // Process each frame (30-80% of the progress bar)
        for (let i = 0; i < frames.length; i++) {
            updateProgress(30 + (i / frames.length * 50), `Analyzing frame ${i + 1}/${frames.length}`);
            // Create frame card. This template only interpolates values we
            // generated ourselves (data: URL, index, formatted time), so it
            // is safe to assign as HTML.
            const frameCard = document.createElement('div');
            frameCard.className = 'frame-card';
            frameCard.innerHTML = `
                <img src="${frames[i].preview}" alt="Frame ${i + 1}">
                <div class="frame-info">
                    <h4>Frame ${i + 1}</h4>
                    <span class="frame-time">${formatTime(frames[i].time)}</span>
                </div>
                <div class="frame-description">
                    <div class="loading-dots">Analyzing...</div>
                </div>
            `;
            framesGrid.appendChild(frameCard);
            const descriptionEl = frameCard.querySelector('.frame-description');
            // Render model output as plain text — it is untrusted and must
            // never be parsed as HTML.
            const renderDescription = (text, color = null) => {
                const p = document.createElement('p');
                if (color) p.style.color = color;
                p.textContent = text;
                descriptionEl.replaceChildren(p);
            };
            // Prepare prompt
            const messages = [
                {
                    role: "user",
                    content: `<image>Describe what's happening in this frame of the video in detail.`,
                },
            ];
            const prompt = processor.apply_chat_template(messages, {
                add_generation_prompt: true,
            });
            // Prepare inputs
            const inputs = await processor(frames[i].image, prompt, {
                add_special_tokens: false,
            });
            // Generate output, streaming partial text into the card
            let generatedText = '';
            try {
                const outputs = await model.generate({
                    ...inputs,
                    max_new_tokens: 256,
                    do_sample: false,
                    streamer: new TextStreamer(processor.tokenizer, {
                        skip_prompt: true,
                        skip_special_tokens: false,
                        callback_function: (text) => {
                            generatedText += text;
                            renderDescription(generatedText);
                        },
                    }),
                });
                // Decode only the newly generated tokens (slice off the prompt)
                const decoded = processor.batch_decode(
                    outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
                    { skip_special_tokens: true },
                );
                frameDescriptions.push({
                    frame: i + 1,
                    time: frames[i].time,
                    description: decoded[0] || generatedText
                });
            } catch (frameError) {
                // One bad frame must not abort the whole run.
                console.error(`Error processing frame ${i + 1}:`, frameError);
                frameDescriptions.push({
                    frame: i + 1,
                    time: frames[i].time,
                    description: 'Failed to analyze this frame'
                });
                renderDescription('Failed to analyze this frame', '#ef4444');
            }
        }
        // Generate overall summary (80-100%)
        updateProgress(80, 'Generating video summary...');
        const summaryCard = document.getElementById('summaryCard');
        const summaryContent = document.getElementById('summaryContent');
        // Create a summary based on the frame descriptions
        if (frameDescriptions.length > 0) {
            const summaryMessages = [
                {
                    role: "user",
                    content: `<image>Based on what you see in this video frame and knowing that the video contains the following sequence: ${frameDescriptions.map(f => `Frame ${f.frame}: ${f.description}`).join('; ')}. Provide a comprehensive summary of what the entire video is about.`,
                },
            ];
            const summaryPrompt = processor.apply_chat_template(summaryMessages, {
                add_generation_prompt: true,
            });
            // Use the last frame's image for context
            const summaryInputs = await processor(frames[frames.length - 1].image, summaryPrompt, {
                add_special_tokens: false,
            });
            let summaryText = '';
            await model.generate({
                ...summaryInputs,
                max_new_tokens: 512,
                do_sample: false,
                streamer: new TextStreamer(processor.tokenizer, {
                    skip_prompt: true,
                    skip_special_tokens: false,
                    callback_function: (text) => {
                        summaryText += text;
                        // Untrusted model output: render as text, not HTML.
                        const p = document.createElement('p');
                        p.textContent = summaryText;
                        summaryContent.replaceChildren(p);
                        summaryCard.classList.remove('hidden');
                    },
                }),
            });
        }
        updateProgress(100, 'Analysis complete!');
        // Show results
        resultsSection.classList.remove('hidden');
        progressSection.classList.add('hidden');
    } catch (error) {
        console.error('Processing error:', error);
        showError(`Failed to process video: ${error.message}`);
    } finally {
        analyzeBtn.disabled = false;
        analyzeBtn.querySelector('.spinner').classList.add('hidden');
        analyzeBtn.querySelector('.btn-text').textContent = 'Analyze Video';
    }
}
// Utility functions
// Format a duration in seconds as "m:ss" (e.g. 75.4 -> "1:15").
function formatTime(seconds) {
    const totalSecs = Math.floor(seconds);
    const minutes = Math.floor(totalSecs / 60);
    const remainder = String(totalSecs % 60).padStart(2, '0');
    return `${minutes}:${remainder}`;
}
// Update the progress bar width, the rounded percent label, and the
// current-status line.
function updateProgress(percent, status) {
    const fill = document.getElementById('progressFill');
    const label = document.getElementById('progressText');
    const statusLine = document.getElementById('currentStatus');
    fill.style.width = `${percent}%`;
    label.textContent = `${Math.round(percent)}%`;
    statusLine.textContent = status;
}
// Show a status message in the progress area.
function updateStatus(message) {
    const statusLine = document.getElementById('currentStatus');
    statusLine.textContent = message;
}
// Surface an error message to the user and hide the in-progress UI.
function showError(message) {
    const errorLabel = document.getElementById('errorMessage');
    errorLabel.textContent = message;
    document.getElementById('errorSection').classList.remove('hidden');
    document.getElementById('progressSection').classList.add('hidden');
}
// Download the analysis (frame captions + summary) as pretty-printed JSON.
// Fixes: previously threw if invoked after resetApp() nulled currentVideo;
// the anchor is now attached to the document (Firefox ignores click() on a
// detached anchor) and removed afterwards.
function downloadResults() {
    const results = {
        timestamp: new Date().toISOString(),
        // Guard: currentVideo may be null after a reset.
        video: currentVideo?.name ?? null,
        frames: frameDescriptions,
        summary: document.getElementById('summaryContent').textContent
    };
    const blob = new Blob([JSON.stringify(results, null, 2)], { type: 'application/json' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = `video-analysis-${Date.now()}.json`;
    document.body.appendChild(a);
    a.click();
    a.remove();
    URL.revokeObjectURL(url);
}
// Return the UI to its initial state and clear module-level state.
function resetApp() {
    document.getElementById('videoInput').value = '';
    document.getElementById('videoInfo').innerHTML = '';
    // Hide every section that only makes sense with a video selected.
    const sectionsToHide = [
        'videoPreview',
        'analyzeBtn',
        'progressSection',
        'resultsSection',
        'errorSection',
    ];
    for (const id of sectionsToHide) {
        document.getElementById(id).classList.add('hidden');
    }
    currentVideo = null;
    frameDescriptions = [];
}
// Event listeners
// Handle video selection: validate the MIME type, show file metadata and a
// preview, and enable analysis.
// Fixes: the user-controlled file name was interpolated into innerHTML (it is
// now rendered via textContent), and the previous preview object URL is
// revoked so repeated selections don't leak memory.
document.getElementById('videoInput').addEventListener('change', (e) => {
    const file = e.target.files[0];
    if (file && file.type.startsWith('video/')) {
        currentVideo = file;
        // Display video info. Built from DOM nodes because file.name is
        // user-controlled and must not be parsed as HTML.
        const videoInfo = document.getElementById('videoInfo');
        const infoRow = document.createElement('div');
        infoRow.className = 'file-info';
        const nameSpan = document.createElement('span');
        nameSpan.className = 'file-name';
        nameSpan.textContent = file.name;
        const sizeSpan = document.createElement('span');
        sizeSpan.className = 'file-size';
        sizeSpan.textContent = `${(file.size / 1024 / 1024).toFixed(2)} MB`;
        infoRow.append(nameSpan, sizeSpan);
        videoInfo.replaceChildren(infoRow);
        // Show video preview, releasing any previously created object URL.
        const videoPreview = document.getElementById('videoPreview');
        const videoElement = document.getElementById('videoElement');
        if (videoElement.src) {
            URL.revokeObjectURL(videoElement.src);
        }
        videoElement.src = URL.createObjectURL(file);
        videoPreview.classList.remove('hidden');
        // Show analyze button
        document.getElementById('analyzeBtn').classList.remove('hidden');
        document.getElementById('analyzeBtn').disabled = false;
    }
});
// Wire the analyze button to the processing pipeline.
document.getElementById('analyzeBtn').addEventListener('click', processVideo);
// Initialize
// Probe WebGPU availability once at startup (fire-and-forget; the function
// updates the UI itself).
checkWebGPUSupport();