# scripts/audio_smoke_data.py

"""
Generate the W1 audio smoke dataset: a handful of 5s sine-wave clips paired
with deterministic transcripts.

Why synthetic instead of real speech: W1 only proves the forward path
(WhisperEncoder -> Projector -> GPT prepend) and that the projector's gradient
flows into a decreasing loss on a tiny fixed set. Real speech adds a network
dependency to a step that should be reproducible offline. W2 swaps in
LibriSpeech.

Audio files land under data/audio_smoke/wavs/ (gitignored). The manifest
data/audio_smoke/manifest.jsonl is the only artifact committed.

Usage:
    python -m scripts.audio_smoke_data
"""

import argparse
import json
import wave
from pathlib import Path

import numpy as np


# Each entry is (fundamental frequency in Hz, transcript paired with that clip).
# The integer part of the frequency also names the WAV file and seeds its noise.
SAMPLES = [
    (220.0, "low tone"),
    (330.0, "mid low tone"),
    (440.0, "middle tone"),
    (660.0, "mid high tone"),
    (880.0, "high tone"),
]
# Sample rate in Hz: default for synthesis, the WAV header, and the manifest "sr" field.
SR = 16000
# Default clip length in seconds.
DURATION_S = 5.0


def synth_sine(freq_hz, duration_s=DURATION_S, sr=SR):
    """Synthesize a mono float32 test tone at ``freq_hz``.

    The signal is the fundamental (amplitude 0.5) plus its second harmonic
    (amplitude 0.25) plus low-level Gaussian noise (scale 0.01). The extra
    components keep Whisper's frames non-degenerate; a pure tone collapses to
    a near-constant log-mel. The noise generator is seeded with
    ``int(freq_hz)``, so the same frequency always yields the same samples.

    Args:
        freq_hz: Fundamental frequency in Hz.
        duration_s: Clip length in seconds.
        sr: Sample rate in Hz.

    Returns:
        1-D ``np.float32`` array of ``int(sr * duration_s)`` samples.
    """
    n_samples = int(sr * duration_s)
    timeline = np.arange(n_samples) / sr
    fundamental = 0.5 * np.sin(2 * np.pi * freq_hz * timeline)
    second_harmonic = 0.25 * np.sin(2 * np.pi * 2 * freq_hz * timeline)
    # Seeding from the frequency makes each clip reproducible across runs.
    noise = 0.01 * np.random.default_rng(int(freq_hz)).standard_normal(n_samples)
    return (fundamental + second_harmonic + noise).astype(np.float32)


def write_wav_pcm16(path, audio, sr=SR):
    """Save ``audio`` as a single-channel 16-bit PCM WAV file at ``path``.

    Samples are clamped to [-1.0, 1.0] and scaled by 32767 before the cast to
    int16. Only the stdlib ``wave`` module is used for the container, so no
    scipy/soundfile dependency is needed.

    Args:
        path: Destination file path (converted with ``str``).
        audio: Array-like of float samples, nominally in [-1.0, 1.0].
        sr: Sample rate in Hz written to the WAV header.
    """
    frames = (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16).tobytes()
    with wave.open(str(path), "wb") as wav_file:
        wav_file.setframerate(sr)
        wav_file.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
        wav_file.setnchannels(1)  # mono
        wav_file.writeframes(frames)


def generate_synthetic(data_dir):
    """Write the synthetic clips and their manifest under ``data_dir``.

    For every ``(freq, text)`` entry in ``SAMPLES`` a WAV is written to
    ``<data_dir>/wavs/sine_<freq>.wav`` and one JSON line with the keys
    ``wav``, ``text`` and ``sr`` is written to ``<data_dir>/manifest.jsonl``.
    The ``wav`` value is a path relative to ``data_dir``.

    Args:
        data_dir: Output directory (string or path-like); created, together
            with its ``wavs`` subdirectory, if it does not exist.

    Returns:
        ``pathlib.Path`` of the manifest file that was written.
    """
    data_dir = Path(data_dir)
    wav_dir = data_dir / "wavs"
    wav_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = data_dir / "manifest.jsonl"
    # Pin encoding and line endings so the manifest bytes are identical on
    # every platform (text mode would otherwise emit CRLF on Windows).
    with open(manifest_path, "w", encoding="utf-8", newline="\n") as f:
        for freq, text in SAMPLES:
            name = f"sine_{int(freq):04d}.wav"
            # Always rewrite the clip: synthesis is deterministic and cheap,
            # and skipping existing files would keep a truncated or stale WAV
            # (e.g. from an older sample rate) next to a manifest that
            # advertises the current SR.
            write_wav_pcm16(wav_dir / name, synth_sine(freq))
            f.write(json.dumps({"wav": f"wavs/{name}", "text": text, "sr": SR}) + "\n")
    print(f"Wrote {len(SAMPLES)} samples to {data_dir}")
    return manifest_path


if __name__ == "__main__":
    # CLI entry point: the only option is the directory the dataset is written to.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", default="data/audio_smoke")
    generate_synthetic(cli.parse_args().data_dir)
omni: W1 audio align smoke — synthetic dataset + 50-step script End-to-end smoke proving the audio path: wav -> WhisperEncoder (frozen) -> Projector -> prepend to text embeddings -> tiny d6 GPT (random init) -> CE loss on text only Pass criterion is a plain "loss drops by at least 0.5". On a 4090 the run finishes in ~1 s and goes 5.55 -> 0.17 over 50 steps, so the threshold has plenty of headroom against false positives. Two design calls worth keeping in mind: 1. Synthetic sine clips, not LibriSpeech. W1 is forward-path proof, not alignment quality, and a deterministic offline dataset means no network on the smoke path. data/audio_smoke/manifest.jsonl is the only thing committed; wavs are regenerated by audio_smoke_data.py and gitignored. W2 swaps in real LibriSpeech. 2. Standalone byte-level tokenizer (UTF-8 bytes + a single BOS, vocab=257). Avoids depending on a trained nanochat BPE — the d6 GPT is random anyway, so vocab choice doesn't matter for "does the gradient flow" smoke. W2 onwards uses the real BPE on a real base. Caveat documented in doc/todo.md: because the LM is also random and being trained, the loss-down here mostly reflects the LM memorising 5 short strings, not Whisper-Projector alignment. That's fine for proving plumbing; W2 freezes the LM so projector-only gradient is the only path to lower loss. 2026-05-05 22:39:20 +01:00			`"""`
			`Generate the W1 audio smoke dataset: a handful of 5s sine-wave clips paired`
			`with deterministic transcripts.`

			`Why synthetic instead of real speech: W1 only proves the forward path`
			`(WhisperEncoder -> Projector -> GPT prepend) and that the projector's gradient`
			`flows into a decreasing loss on a tiny fixed set. Real speech adds a network`
			`dependency to a step that should be reproducible offline. W2 swaps in`
			`LibriSpeech.`

			`Audio files land under data/audio_smoke/wavs/ (gitignored). The manifest`
			`data/audio_smoke/manifest.jsonl is the only artifact committed.`

			`Usage:`
			`python -m scripts.audio_smoke_data`
			`"""`

			`import argparse`
			`import json`
			`import wave`
			`from pathlib import Path`

			`import numpy as np`


			`SAMPLES = [`
			`(220.0, "low tone"),`
			`(330.0, "mid low tone"),`
			`(440.0, "middle tone"),`
			`(660.0, "mid high tone"),`
			`(880.0, "high tone"),`
			`]`
			`SR = 16000`
			`DURATION_S = 5.0`


			`def synth_sine(freq_hz, duration_s=DURATION_S, sr=SR):`
			`"""Sine + 2nd harmonic + a sliver of noise so Whisper sees non-degenerate`
			`frames (a pure tone collapses to a near-constant log-mel)."""`
			`t = np.arange(int(sr * duration_s)) / sr`
			`x = 0.5 * np.sin(2 * np.pi * freq_hz * t) + 0.25 * np.sin(2 * np.pi * 2 * freq_hz * t)`
			`rng = np.random.default_rng(int(freq_hz))`
			`x = x + 0.01 * rng.standard_normal(len(x))`
			`return x.astype(np.float32)`


			`def write_wav_pcm16(path, audio, sr=SR):`
			`"""Write mono PCM16 WAV using the stdlib (no scipy/soundfile dependency)."""`
			`pcm = np.clip(audio, -1.0, 1.0)`
			`pcm = (pcm * 32767.0).astype(np.int16)`
			`with wave.open(str(path), "wb") as w:`
			`w.setnchannels(1)`
			`w.setsampwidth(2)`
			`w.setframerate(sr)`
			`w.writeframes(pcm.tobytes())`


			`def generate_synthetic(data_dir):`
			`data_dir = Path(data_dir)`
			`wav_dir = data_dir / "wavs"`
			`wav_dir.mkdir(parents=True, exist_ok=True)`
			`manifest_path = data_dir / "manifest.jsonl"`
			`with open(manifest_path, "w") as f:`
			`for freq, text in SAMPLES:`
			`name = f"sine_{int(freq):04d}.wav"`
			`wav_path = wav_dir / name`
			`if not wav_path.exists():`
			`write_wav_pcm16(wav_path, synth_sine(freq))`
			`f.write(json.dumps({"wav": f"wavs/{name}", "text": text, "sr": SR}) + "\n")`
			`print(f"Wrote {len(SAMPLES)} samples to {data_dir}")`
			`return manifest_path`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--data-dir", default="data/audio_smoke")`
			`args = parser.parse_args()`
			`generate_synthetic(args.data_dir)`