Multimodal Browser AI with Transformers.js for Pictures and Speech

<title>Multimodal Media Analyzer</title>

/* Reset: border-box sizing and no default spacing on any element */
* { box-sizing: border-box; margin: 0; padding: 0; }

/* Page shell: a centered column capped at 820px */
body {
  font-family: system-ui, sans-serif;
  max-width: 820px;
  margin: 0 auto;
  padding: 1.5rem 1rem;
  background: #f1f5f9;
  color: #1e293b;
}

/* Page header: title plus a muted subtitle */
header { margin-bottom: 1.5rem; }

header h1 { font-size: 1.5rem; }

header p { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

/* Model status indicators */

.model-status-bar {
  display: flex;
  gap: 0.5rem;
  flex-wrap: wrap;
  margin-top: 0.75rem;
}

/* Default (amber) badge look, shown while a model is still loading */
.model-badge {
  font-size: 0.78rem;
  padding: 0.2rem 0.6rem;
  border-radius: 12px;
  background: #fef3c7;
  color: #92400e;
}

/* Green variant applied once the script adds the "ready" class */
.model-badge.ready { background: #dcfce7; color: #15803d; }

/* Tab bar */

.tabs {
  display: flex;
  background: white;
  border-radius: 8px;
  padding: 0.25rem;
  gap: 0.25rem;
  margin-bottom: 1.25rem;
  border: 1px solid #e2e8f0;
}

/* Tabs share the bar width equally */
.tab {
  flex: 1;
  padding: 0.5rem;
  text-align: center;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9rem;
  color: #64748b;
  transition: all 0.15s;
}

/* The script toggles "active" on the selected tab */
.tab.active { background: #2563eb; color: white; font-weight: 600; }

/* Input panels */

/* Panels are hidden unless they carry the "active" class */
.panel { display: none; }

.panel.active { display: block; }

/* NOTE(review): class name reconstructed from garbled text; confirm it matches the markup */
.upload-area {
  background: white;
  border: 2px dashed #cbd5e1;
  border-radius: 8px;
  padding: 2rem;
  text-align: center;
  cursor: pointer;
}

/* The file input inside the upload area is hidden */
.upload-area input { display: none; }

/* Image preview stays hidden until the script sets display: block */
#img-preview {
  margin-top: 1rem;
  max-width: 100%;
  max-height: 320px;
  border-radius: 8px;
  display: none;
  object-fit: cover;
}

.mic-center { text-align: center; padding: 1rem 0; }

/* Round record button */
#rec-btn {
  width: 72px; height: 72px;
  border-radius: 50%; border: none;
  background: #dc2626; color: white;
  font-size: 1.6rem; cursor: pointer;
  display: flex; align-items: center; justify-content: center;
  margin: 0 auto 0.5rem;
}

#rec-btn.recording { background: #374151; }

#rec-btn:disabled { background: #94a3b8; cursor: not-allowed; }

#rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }

#rec-hint { font-size: 0.85rem; color: #64748b; }

#wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }

/* Results grid */

/* Responsive card grid: as many 220px+ columns as fit */
.results-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
  gap: 1rem;
  margin-top: 1.25rem;
}

.result-card {
  background: white;
  border: 1px solid #e2e8f0;
  border-radius: 8px;
  padding: 1rem;
}

/* Small uppercase card heading */
.result-card h3 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.06em;
  color: #64748b;
  margin-bottom: 0.6rem;
}

/* One classification row: label on the left, score pill on the right */
.label-item {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0.25rem 0;
  font-size: 0.875rem;
  border-bottom: 1px solid #f1f5f9;
}

.label-score {
  font-size: 0.8rem;
  color: #64748b;
  background: #f1f5f9;
  padding: 0.1rem 0.4rem;
  border-radius: 4px;
}

.caption-body {
  font-size: 0.95rem;
  line-height: 1.5;
  font-style: italic;
  color: #334155;
}

/* pre-wrap keeps line breaks in the transcript text */
.transcript-body {
  font-size: 0.95rem;
  line-height: 1.6;
  color: #334155;
  white-space: pre-wrap;
}

.placeholder-text { color: #94a3b8; font-style: italic; font-size: 0.9rem; }

#global-status {
  font-size: 0.85rem;
  color: #64748b;
  margin-bottom: 1rem;
}

/* Narrow screens: stack the result cards in a single column */
@media (max-width: 500px) {
  .results-grid { grid-template-columns: 1fr; }
}

<h1>Multimodal Media Analyzer</h1>

<p>Image classification, captioning, and speech transcription — all in your browser.</p>

<span class="model-badge" id="badge-cls">Classifier: loading...</span>

<span class="model-badge" id="badge-cap">Captioner: loading...</span>

<span class="model-badge" id="badge-asr">Whisper: loading...</span>

</div>

</header>

<div id="global-status">Loading models in parallel — first run downloads ~400 MB total.</div>

<div class="tab active" data-tab="image">🖼 Image Analysis</div>

<div class="tab" data-tab="speech">🎙 Speech Transcription</div>

</div>

<!-- Image panel -->

<p>Click or drag an image to analyze</p>

JPG, PNG, WebP, GIF supported

</p>

</div>

</div>

<!-- Speech panel -->

<div id="rec-hint">Waiting for Whisper model...</div>

</div>

<!-- Results – shown for both modes -->

<!-- Image results (shown in image mode) -->

<h3>Classification</h3>

<p class="placeholder-text">No results yet.</p>

</div>

<h3>Caption</h3>

<p class="placeholder-text">No caption yet.</p>

</div>

<!-- Speech results (shown in speech mode) -->

<h3>Transcription</h3>

<p class="placeholder-text">Record audio to see the transcription.</p>

</div>

import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2';

// ── Pipeline references ───────────────────────────────────────────────

// Assigned once all three pipelines have finished loading; undefined until then.
let classifier;
let captioner;
let transcriber;

// Number of model badges that have been switched to the "ready" state.
let readyCount = 0;

/**
 * Switch a model badge to its "ready" state and, once three distinct badges
 * are ready, enable the recording controls.
 *
 * @param {string} badgeId - id of the badge element to update.
 * @param {string} label - model name shown in the badge text.
 */
function markReady(badgeId, label) {
  const badge = document.getElementById(badgeId);

  // Completion may be reported more than once for the same model;
  // count each badge a single time so the total cannot overshoot.
  if (badge.classList.contains('ready')) {
    return;
  }

  badge.textContent = `${label}: ready`;
  badge.classList.add('ready');
  readyCount++;

  if (readyCount === 3) {
    globalStatus.textContent = 'All models ready. Upload an image or record audio.';
    recBtn.disabled = false;
    recHint.textContent = 'Click to start recording.';
  }
}

// Load all three pipelines concurrently.

/**
 * Start loading one pipeline and flip its badge when loading has finished.
 *
 * The badge is updated when the pipeline promise itself resolves rather than
 * from progress events, because download progress is reported per file and a
 * per-file "done" event does not mean the whole model is usable.
 *
 * @param {string} task - pipeline task name.
 * @param {string} model - model id to load.
 * @param {string} badgeId - id of the badge element for this model.
 * @param {string} label - model name shown in the badge.
 * @returns {Promise<Function>} resolves with the loaded pipeline.
 */
const loadPipeline = (task, model, badgeId, label) =>
  pipeline(task, model, { dtype: 'q8' }).then((loaded) => {
    markReady(badgeId, label);
    return loaded;
  });

Promise.all([
  loadPipeline('image-classification', 'Xenova/vit-base-patch16-224', 'badge-cls', 'Classifier'),
  loadPipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning', 'badge-cap', 'Captioner'),
  loadPipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', 'badge-asr', 'Whisper'),
])
  .then(([cls, cap, asr]) => {
    classifier = cls;
    captioner = cap;
    transcriber = asr;
  })
  .catch((err) => {
    globalStatus.textContent = `Error loading models: ${err.message}`;
  });

// ── UI references ─────────────────────────────────────────────────────

// Status line, results container, and recorder controls, looked up once by id.
const globalStatus = document.getElementById('global-status');
const resultsGrid = document.getElementById('results-grid');
const recBtn = document.getElementById('rec-btn');
const recHint = document.getElementById('rec-hint');
const recTimer = document.getElementById('rec-timer');
const waveCanvas = document.getElementById('wave-canvas');

// 2D drawing context used to render the live waveform.
const waveCtx = waveCanvas.getContext('2d');

// ── Image analysis ────────────────────────────────────────────────────

/**
 * Run classification and captioning on an image and render both results.
 *
 * Does nothing except update the status line if either pipeline is not
 * loaded yet. Failures from the pipelines are reported in the status line
 * instead of being thrown.
 *
 * @param {string} dataUrl - image source handed to both pipelines.
 * @returns {Promise<void>}
 */
async function analyzeImage(dataUrl) {
  if (!classifier || !captioner) {
    globalStatus.textContent = 'Models still loading. Please wait.';
    return;
  }

  globalStatus.textContent = 'Running classification and captioning…';

  // Show image result cards, hide speech card.
  document.getElementById('card-cls').style.display = 'block';
  document.getElementById('card-cap').style.display = 'block';
  document.getElementById('card-asr').style.display = 'none';
  resultsGrid.style.display = 'grid';

  const clsContent = document.getElementById('cls-content');
  const capContent = document.getElementById('cap-content');
  clsContent.innerHTML = '<p class="placeholder-text">Classifying…</p>';
  capContent.innerHTML = '<p class="placeholder-text">Generating caption…</p>';

  try {
    // Run classification and captioning in parallel.
    const [classResults, captionResults] = await Promise.all([
      classifier(dataUrl, { top_k: 4 }),
      captioner(dataUrl, { max_new_tokens: 60 }),
    ]);

    // Render classification labels. Model output is inserted via textContent
    // so it can never be interpreted as markup.
    const rows = classResults.map(({ label, score }) => {
      const row = document.createElement('div');
      row.className = 'label-item';

      const name = document.createElement('span');
      name.textContent = label;

      const pct = document.createElement('span');
      pct.className = 'label-score';
      pct.textContent = `${(score * 100).toFixed(1)}%`;

      row.append(name, pct);
      return row;
    });
    clsContent.replaceChildren(...rows);

    // Render generated caption, wrapped in quotation marks.
    const caption = document.createElement('p');
    caption.className = 'caption-body';
    caption.textContent = `"${captionResults[0]?.generated_text ?? 'No caption.'}"`;
    capContent.replaceChildren(caption);

    globalStatus.textContent = 'Analysis complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// File upload handler for images

// Drop target, the file input it forwards clicks to, and the preview image.
const imgDrop = document.getElementById('img-drop');
const imgInput = document.getElementById('img-input');
const imgPrev = document.getElementById('img-preview');

/**
 * Preview an image file and send it for analysis.
 *
 * Missing files and files whose MIME type does not start with "image/" are
 * ignored silently.
 *
 * @param {File | undefined} file - file chosen or dropped by the user.
 */
function handleImageFile(file) {
  if (!file?.type.startsWith('image/')) return;

  const reader = new FileReader();

  // Once the file has been read as a data URL, show it and analyze it.
  reader.onload = (e) => {
    const dataUrl = e.target.result;
    imgPrev.src = dataUrl;
    imgPrev.style.display = 'block';
    analyzeImage(dataUrl);
  };

  reader.readAsDataURL(file);
}

// Clicking the drop area opens the file picker.
imgDrop.addEventListener('click', () => imgInput.click());

// A file chosen through the picker: only the first file is used.
imgInput.addEventListener('change', (e) => handleImageFile(e.target.files[0]));

// preventDefault on dragover is required for the element to accept drops.
imgDrop.addEventListener('dragover', (e) => e.preventDefault());

// A dropped file: stop the browser from navigating to it, then handle the first file.
imgDrop.addEventListener('drop', (e) => {
  e.preventDefault();
  handleImageFile(e.dataTransfer.files[0]);
});

// ── Audio decoding helper ─────────────────────────────────────────────

/**
 * Decode encoded audio bytes and return the samples of the first channel.
 *
 * Decoding uses a temporary AudioContext created with a 16 kHz sample rate.
 * The context is closed afterwards, whether or not decoding succeeds, so
 * repeated recordings do not accumulate open audio contexts.
 *
 * @param {ArrayBuffer} arrayBuffer - encoded audio data.
 * @returns {Promise<Float32Array>} samples of channel 0.
 */
async function decodeAudio(arrayBuffer) {
  const audioCtx = new AudioContext({ sampleRate: 16000 });

  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0);
  } finally {
    await audioCtx.close();
  }
}

// ── Speech transcription ──────────────────────────────────────────────

/**
 * Transcribe audio samples with the speech pipeline and render the text.
 *
 * Failures from the pipeline are reported in the status line instead of
 * being thrown.
 *
 * @param {Float32Array} audioData - audio samples handed to the transcriber.
 * @returns {Promise<void>}
 */
async function runTranscription(audioData) {
  // Show speech result card, hide image cards.
  document.getElementById('card-cls').style.display = 'none';
  document.getElementById('card-cap').style.display = 'none';
  document.getElementById('card-asr').style.display = 'block';
  resultsGrid.style.display = 'grid';

  const asrContent = document.getElementById('asr-content');
  asrContent.innerHTML = '<p class="placeholder-text">Transcribing…</p>';

  globalStatus.textContent = 'Running Whisper transcription…';

  try {
    const result = await transcriber(audioData, {
      chunk_length_s: 30,
      stride_length_s: 5,
    });

    // The transcript is inserted via textContent so it can never be
    // interpreted as markup.
    const transcript = document.createElement('p');
    transcript.className = 'transcript-body';
    transcript.textContent = result.text.trim();
    asrContent.replaceChildren(transcript);

    globalStatus.textContent = 'Transcription complete.';
  } catch (err) {
    globalStatus.textContent = `Error: ${err.message}`;
  }
}

// ── Microphone recording ──────────────────────────────────────────────

// Mutable recorder state shared by the waveform renderer and the record button.
let mediaRecorder;
let audioChunks = [];
let timerInterval;
let analyserNode;
let animId;

// Elapsed recording time in seconds.
let secs = 0;

/**
 * Draw one frame of the live waveform and schedule the next frame.
 *
 * Reads the analyser's current time-domain bytes, plots them as a single
 * polyline across the full canvas width, and stores the animation-frame id
 * in `animId` so the loop can be cancelled.
 */
function drawWave() {
  const buf = new Uint8Array(analyserNode.frequencyBinCount);
  analyserNode.getByteTimeDomainData(buf);

  waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
  waveCtx.beginPath();
  waveCtx.strokeStyle = '#2563eb';
  waveCtx.lineWidth = 1.5;

  buf.forEach((v, i) => {
    const x = (i / buf.length) * waveCanvas.width;
    // A byte value of 128 maps to the vertical middle of the canvas.
    const y = (v / 128.0) * (waveCanvas.height / 2);

    if (i === 0) {
      waveCtx.moveTo(x, y);
    } else {
      waveCtx.lineTo(x, y);
    }
  });

  waveCtx.stroke();
  animId = requestAnimationFrame(drawWave);
}

// Record button: the first click starts a recording, the next click stops it
// and sends the captured audio for transcription.
recBtn.addEventListener('click', async () => {
  // ── Stop an active recording ──
  if (mediaRecorder?.state === 'recording') {
    mediaRecorder.stop(); // triggers the onstop handler set up below
    recBtn.classList.remove('recording');
    recBtn.textContent = '🎙';
    clearInterval(timerInterval);
    cancelAnimationFrame(animId);
    waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);
    recHint.textContent = 'Processing…';
    return;
  }

  // ── Start a new recording ──
  try {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Analyser feeding the live waveform display.
    const actx = new AudioContext();
    analyserNode = actx.createAnalyser();
    actx.createMediaStreamSource(stream).connect(analyserNode);
    analyserNode.fftSize = 256;

    mediaRecorder = new MediaRecorder(stream);
    audioChunks = [];

    // Keep only non-empty chunks.
    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size > 0) {
        audioChunks.push(e.data);
      }
    };

    mediaRecorder.onstop = async () => {
      // The recording is over: release the microphone right away.
      stream.getTracks().forEach((track) => track.stop());

      try {
        // The analyser context is no longer needed once recording has stopped.
        await actx.close();

        const blob = new Blob(audioChunks, { type: 'audio/webm' });
        const arrayBuffer = await blob.arrayBuffer();
        const audioData = await decodeAudio(arrayBuffer);
        await runTranscription(audioData);
        recHint.textContent = 'Click to record again.';
      } catch (err) {
        // Decoding can fail; surface it instead of leaving the hint on "Processing…".
        recHint.textContent = `Audio error: ${err.message}`;
      }
    };

    mediaRecorder.start();
    recBtn.classList.add('recording');
    recBtn.textContent = '⏹';

    // Restart the m:ss timer and tick it once per second.
    secs = 0;
    recTimer.textContent = '0:00';
    timerInterval = setInterval(() => {
      secs++;
      recTimer.textContent = `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, '0')}`;
    }, 1000);

    recHint.textContent = 'Recording… click to stop.';
    drawWave();
  } catch (err) {
    recHint.textContent = `Mic error: ${err.message}`;
  }
});

// ── Tab switching ─────────────────────────────────────────────────────

// Clicking a tab deactivates every tab and panel, then activates the clicked
// tab together with the panel whose id is "panel-" + the tab's data-tab value.
document.querySelectorAll('.tab').forEach((tab) => {
  tab.addEventListener('click', () => {
    document.querySelectorAll('.tab, .panel').forEach((el) => {
      el.classList.remove('active');
    });

    tab.classList.add('active');
    document.getElementById(`panel-${tab.dataset.tab}`).classList.add('active');
  });
});

Multimodal Browser AI with Transformers.js for Pictures and Speech

Larger Context Windows Don’t Fix RAG — So I Built a System That Does

Leave a Reply Cancel reply

Popular News

Best practices for Amazon SageMaker HyperPod task governance

How Cursor Actually Indexes Your Codebase

Build a serverless audio summarization solution with Amazon Bedrock and Whisper

Accelerate edge AI development with SiMa.ai Edgematic with a seamless AWS integration

Context Engineering — A Comprehensive Hands-On Tutorial with DSPy

About Us

Category

Recent Posts