nanochat/common.py

"""
Common utilities for nanochat.
"""

import os
import re
import logging
import urllib.request
import torch
import torch.distributed as dist
from filelock import FileLock

class ColoredFormatter(logging.Formatter):
    """Log formatter that decorates its output with ANSI colors.

    The level name is rendered bold in a per-level color. For INFO records,
    quantities such as "12.5 GB", "40%" or "100 docs" are additionally shown
    in bold and "Shard N" markers are shown in bold green.
    """
    # ANSI color codes
    COLORS = {
        'DEBUG': '\033[36m',    # Cyan
        'INFO': '\033[32m',     # Green
        'WARNING': '\033[33m',  # Yellow
        'ERROR': '\033[31m',    # Red
        'CRITICAL': '\033[35m', # Magenta
    }
    RESET = '\033[0m'
    BOLD = '\033[1m'

    def format(self, record):
        """Return the colorized, formatted text for ``record``.

        The record's ``levelname`` is replaced by its colored form only while
        the base formatter runs and is restored afterwards. A LogRecord is
        shared by every handler that processes it, so leaving the escape codes
        in place would leak them into other handlers and would make a second
        call to this method produce different output.
        """
        levelname = record.levelname
        if levelname in self.COLORS:
            record.levelname = f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
        try:
            # Format the message with the temporarily colored level name
            message = super().format(record)
        finally:
            # Always hand the record back in its original state
            record.levelname = levelname
        # Add color to specific parts of the message
        if levelname == 'INFO':
            # Highlight numbers and percentages
            message = re.sub(r'(\d+\.?\d*\s*(?:GB|MB|%|docs))', rf'{self.BOLD}\1{self.RESET}', message)
            message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
        return message

def setup_default_logging():
    """Attach a colorized stream handler to the root logger at INFO level.

    Relies on logging.basicConfig, which leaves the root logger alone if it
    already has handlers configured.
    """
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(ColoredFormatter(log_format))
    logging.basicConfig(level=logging.INFO, handlers=[stream_handler])

# Configure logging as an import-time side effect, so importing this module
# is enough to get the colorized log format.
setup_default_logging()
# Module-level logger used by the helpers in this file.
logger = logging.getLogger(__name__)

def get_base_dir():
    """Return the directory for nanochat intermediates, creating it if needed.

    A non-empty NANOCHAT_BASE_DIR environment variable takes precedence;
    otherwise the directory is ~/.cache/nanochat, next to other cached data.
    """
    override = os.environ.get("NANOCHAT_BASE_DIR")
    if override:
        base_dir = override
    else:
        base_dir = os.path.join(os.path.expanduser("~"), ".cache", "nanochat")
    os.makedirs(base_dir, exist_ok=True)
    return base_dir

def download_file_with_lock(url, filename, postprocess_fn=None):
    """
    Downloads a file from a URL to a local path in the base directory.
    Uses a lock file to prevent concurrent downloads among multiple ranks.

    Args:
        url: address to fetch with urllib.
        filename: name of the target file, relative to the base directory.
        postprocess_fn: optional callable invoked with the final file path
            once the download has been written (only by the rank that
            actually performed the download).

    Returns:
        The path of the downloaded (or already present) file.
    """
    base_dir = get_base_dir()
    file_path = os.path.join(base_dir, filename)
    lock_path = file_path + ".lock"

    # Fast path without taking the lock. This is only safe because the file
    # is published atomically below, so it never exists in a partial state.
    if os.path.exists(file_path):
        return file_path

    with FileLock(lock_path):
        # Only a single rank can acquire this lock
        # All other ranks block until it is released

        # Recheck after acquiring lock
        if os.path.exists(file_path):
            return file_path

        # Download the content as bytes
        print(f"Downloading {url}...")
        with urllib.request.urlopen(url) as response:
            content = response.read() # bytes

        # Write to a temporary sibling file and rename it into place, so that
        # neither a concurrent reader nor a later run (after an interrupted
        # write) can ever observe a truncated file at file_path.
        tmp_path = file_path + ".tmp"
        try:
            with open(tmp_path, 'wb') as f:
                f.write(content)
            os.replace(tmp_path, file_path)
        finally:
            # Only still present if the write or the rename failed
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(f"Downloaded to {file_path}")

        # Run the postprocess function if provided.
        # NOTE: it runs after the file has been published, so a rank taking
        # the lock-free fast path may return before postprocessing finishes.
        if postprocess_fn is not None:
            postprocess_fn(file_path)

    return file_path

def print0(s="",**kwargs):
    """Print ``s`` only on rank 0, as given by the RANK env var (default 0).

    Extra keyword arguments are forwarded to the builtin print.
    """
    if int(os.environ.get('RANK', 0)) != 0:
        return
    print(s, **kwargs)

def print_banner():
    """Print the nanochat ASCII-art banner via print0 (i.e. on rank 0 only)."""
    # Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
    banner = """
                                                       █████                █████
                                                      ░░███                ░░███
     ████████    ██████   ████████    ██████   ██████  ░███████    ██████  ███████
    ░░███░░███  ░░░░░███ ░░███░░███  ███░░███ ███░░███ ░███░░███  ░░░░░███░░░███░
     ░███ ░███   ███████  ░███ ░███ ░███ ░███░███ ░░░  ░███ ░███   ███████  ░███
     ░███ ░███  ███░░███  ░███ ░███ ░███ ░███░███  ███ ░███ ░███  ███░░███  ░███ ███
     ████ █████░░████████ ████ █████░░██████ ░░██████  ████ █████░░███████  ░░█████
    ░░░░ ░░░░░  ░░░░░░░░ ░░░░ ░░░░░  ░░░░░░   ░░░░░░  ░░░░ ░░░░░  ░░░░░░░░   ░░░░░
    """
    print0(banner)

def is_ddp_requested() -> bool:
    """
    Report whether the torchrun-style environment is present.

    True only when RANK, LOCAL_RANK and WORLD_SIZE are all set, regardless of
    whether a process group has been initialized yet. Used to decide whether
    a process group *should* be initialized.
    """
    for required_key in ("RANK", "LOCAL_RANK", "WORLD_SIZE"):
        if required_key not in os.environ:
            return False
    return True

def is_ddp_initialized() -> bool:
    """
    Report whether a torch.distributed process group is currently live.

    False when torch.distributed is unavailable or no process group has been
    initialized. Used at cleanup to avoid destroying a non-existent PG.
    """
    if not dist.is_available():
        return False
    return dist.is_initialized()

def get_dist_info():
    """Return (ddp_requested, rank, local_rank, world_size) from the environment.

    Without the torchrun env this is the single-process default
    (False, 0, 0, 1). Only the environment is consulted here; initializing
    the process group happens in compute_init.
    """
    if not is_ddp_requested():
        return False, 0, 0, 1
    env_keys = ['RANK', 'LOCAL_RANK', 'WORLD_SIZE']
    assert all(var in os.environ for var in env_keys)
    rank, local_rank, world_size = (int(os.environ[key]) for key in env_keys)
    return True, rank, local_rank, world_size

def autodetect_device_type():
    """Pick the best available device type: "cuda", then "mps", then "cpu".

    The choice is announced via print0 and returned as a string.
    """
    if torch.cuda.is_available():
        chosen = "cuda"
    else:
        chosen = "mps" if torch.backends.mps.is_available() else "cpu"
    print0(f"Autodetected device type: {chosen}")
    return chosen

def compute_init(device_type="cuda"): # cuda|cpu|mps
    """Common start-of-script initialization: device checks, seeds, precision, DDP.

    Args:
        device_type: one of "cuda", "mps" or "cpu".

    Returns:
        Tuple (is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size, device).
        The first four values come from get_dist_info(); device is a torch.device.

    Raises:
        AssertionError: if device_type is not supported, or the requested
            CUDA/MPS backend is not available.

    Note:
        A process group is only initialized when DDP is requested AND
        device_type is "cuda". For other device types the DDP values read from
        the environment are still returned, but no process group is created.
    """

    # Validate the requested device type against what this PyTorch build offers
    assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm"
    if device_type == "cuda":
        assert torch.cuda.is_available(), "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'"
    if device_type == "mps":
        assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"

    # Reproducibility
    # Note that we set the global seeds here, but most of the code uses explicit rng objects.
    # The only place where global rng might be used is nn.Module initialization of the model weights.
    torch.manual_seed(42)
    if device_type == "cuda":
        torch.cuda.manual_seed(42)
    # skipping full reproducibility for now, possibly investigate slowdown later
    # torch.use_deterministic_algorithms(True)

    # Precision
    if device_type == "cuda":
        torch.backends.cuda.matmul.fp32_precision = "tf32" # uses tf32 instead of fp32 for matmuls

    # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
    # NOTE: the local name is_ddp_requested shadows the module-level function
    # of the same name for the rest of this function.
    is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
    if is_ddp_requested and device_type == "cuda":
        # Bind this process to the GPU indexed by its local rank before creating the process group
        device = torch.device("cuda", ddp_local_rank)
        torch.cuda.set_device(device)  # make "cuda" default to this device
        dist.init_process_group(backend="nccl", device_id=device)
        dist.barrier()
    else:
        device = torch.device(device_type) # mps|cpu

    # Log the world size once (from rank 0 only)
    if ddp_rank == 0:
        logger.info(f"Distributed world size: {ddp_world_size}")

    return is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size, device

def compute_cleanup():
    """Counterpart of compute_init: tear down the process group, if one exists."""
    if not is_ddp_initialized():
        return
    dist.destroy_process_group()

class DummyWandb:
    """Stand-in for when wandb is not used: same method names, no effect."""

    def __init__(self):
        """Nothing to set up."""

    def log(self, *args, **kwargs):
        """Accept any arguments and discard them."""
        return None

    def finish(self):
        """Do nothing."""
        return None

# hardcoded BF16 peak flops for NVIDIA A100, H100, H200, B200, L40S GPU and AMD MI250X, MI300X, MI325X, MI355X and Intel PVC
# inspired by torchtitan: https://github.com/pytorch/torchtitan/blob/main/torchtitan/tools/utils.py
def get_peak_flops(device_name: str) -> float:
    """Return the hardcoded peak BF16 FLOPS for the accelerator named ``device_name``.

    Matching is by substring of the device name. Unknown devices log a warning
    and fall back to the A100 figure.

    Args:
        device_name: human-readable device name, e.g. "NVIDIA H100 PCIe".

    Returns:
        Peak BF16 throughput in FLOPS.
    """
    if "A100" in device_name:
        # data from https://www.nvidia.com/en-us/data-center/a100/
        return 312e12
    elif "H100" in device_name:
        # data from https://www.nvidia.com/en-us/data-center/h100/
        # NOTE: Specifications are one-half lower without sparsity.
        if "NVL" in device_name:
            return 835e12
        elif "PCIe" in device_name:
            return 756e12
        else:  # for H100 SXM and other variants
            return 989e12
    elif "H200" in device_name:
        # data from https://www.nvidia.com/en-us/data-center/h200/
        return 989e12
    elif "B200" in device_name:
        # data from https://nvdam.widen.net/s/wwnsxrhm2w/blackwell-datasheet-3384703
        return 2.25e15
    elif "MI355X" in device_name:
        # MI355X data from https://www.amd.com/en/products/accelerators/instinct/mi350/mi355x.html
        return 2500e12
    elif "MI300X" in device_name or "MI325X" in device_name:
        # MI300X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html
        # MI325X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html
        return 1300e12
    elif "MI250X" in device_name:
        # data from https://www.amd.com/en/products/accelerators/instinct/mi200/mi250x.html (per GCD)
        return 191.5e12
    elif "Data Center GPU Max 1550" in device_name:
        # Also known as Ponte Vecchio (PVC).
        # data from https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
        # Dot Product Accumulate Systolic (DPAS):
        # - Freq: 1300MHz
        # - #ops: 512
        # Full EU mode (i.e. 512 max compute units): 340.8 TFLOPS (BF16)
        # Standard EU mode (i.e. 448 max compute units): 298.2 TFLOPS (BF16)
        max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units
        return 512 * max_comp_units * 1300 * 10**6
    elif "l40s" in device_name.lower():
        # Compared case-insensitively: a lowercase-only check misses names
        # that spell the model in upper case, such as "NVIDIA L40S".
        # data from: "https://resources.nvidia.com/en-us-l40s/l40s-datasheet-28413"
        return 362e12

    else:  # for other GPU types, assume A100
        logger.warning(f"Peak flops undefined for: {device_name}, fallback to A100")
        return 312e12
initial commit 2025-10-13 06:49:24 -07:00			`"""`
			`Common utilities for nanochat.`
			`"""`

			`import os`
			`import re`
			`import logging`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`import urllib.request`
initial commit 2025-10-13 06:49:24 -07:00			`import torch`
			`import torch.distributed as dist`
Replace fcntl with filelock for Windows compatibility 2025-11-04 07:22:34 +00:00			`from filelock import FileLock`
initial commit 2025-10-13 06:49:24 -07:00
			`class ColoredFormatter(logging.Formatter):`
			`"""Custom formatter that adds colors to log messages."""`
			`# ANSI color codes`
			`COLORS = {`
			`'DEBUG': '\033[36m', # Cyan`
			`'INFO': '\033[32m', # Green`
			`'WARNING': '\033[33m', # Yellow`
			`'ERROR': '\033[31m', # Red`
			`'CRITICAL': '\033[35m', # Magenta`
			`}`
			`RESET = '\033[0m'`
			`BOLD = '\033[1m'`
			`def format(self, record):`
			`# Add color to the level name`
			`levelname = record.levelname`
			`if levelname in self.COLORS:`
			`record.levelname = f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"`
			`# Format the message`
			`message = super().format(record)`
			`# Add color to specific parts of the message`
			`if levelname == 'INFO':`
			`# Highlight numbers and percentages`
			`message = re.sub(r'(\d+\.?\d\s(?:GB\|MB\|%\|docs))', rf'{self.BOLD}\1{self.RESET}', message)`
			`message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)`
			`return message`

			`def setup_default_logging():`
			`handler = logging.StreamHandler()`
			`handler.setFormatter(ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))`
			`logging.basicConfig(`
			`level=logging.INFO,`
			`handlers=[handler]`
			`)`

			`setup_default_logging()`
			`logger = logging.getLogger(__name__)`

			`def get_base_dir():`
			`# co-locate nanochat intermediates with other cached data in ~/.cache (by default)`
			`if os.environ.get("NANOCHAT_BASE_DIR"):`
			`nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR")`
			`else:`
			`home_dir = os.path.expanduser("~")`
			`cache_dir = os.path.join(home_dir, ".cache")`
			`nanochat_dir = os.path.join(cache_dir, "nanochat")`
			`os.makedirs(nanochat_dir, exist_ok=True)`
			`return nanochat_dir`

move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`def download_file_with_lock(url, filename, postprocess_fn=None):`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`"""`
			`Downloads a file from a URL to a local path in the base directory.`
			`Uses a lock file to prevent concurrent downloads among multiple ranks.`
			`"""`
			`base_dir = get_base_dir()`
			`file_path = os.path.join(base_dir, filename)`
			`lock_path = file_path + ".lock"`

			`if os.path.exists(file_path):`
			`return file_path`

Replace fcntl with filelock for Windows compatibility 2025-11-04 07:22:34 +00:00			`with FileLock(lock_path):`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`# Only a single rank can acquire this lock`
			`# All other ranks block until it is released`

Merge branch 'master' into master 2025-11-04 16:35:02 -08:00			`# Recheck after acquiring lock`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`if os.path.exists(file_path):`
			`return file_path`

move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`# Download the content as bytes`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`print(f"Downloading {url}...")`
			`with urllib.request.urlopen(url) as response:`
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`content = response.read() # bytes`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`# Write to local file`
			`with open(file_path, 'wb') as f:`
add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`f.write(content)`
			`print(f"Downloaded to {file_path}")`

move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`# Run the postprocess function if provided`
			`if postprocess_fn is not None:`
			`postprocess_fn(file_path)`

add the SpellingBee task so that nanochat can count r in strawberry etc. along the way we had to add a bunch of new functionality, e.g. extend the calculator to support the count function of python. possibly the current TaskMixture uses way too many synthetic examples of SpellingBee because the eval gives us exactly 100% performance on spelling. We can tune this later to reclaim some wall clock time here I think 2025-10-24 14:02:48 +00:00			`return file_path`

initial commit 2025-10-13 06:49:24 -07:00			`def print0(s="",**kwargs):`
			`ddp_rank = int(os.environ.get('RANK', 0))`
			`if ddp_rank == 0:`
			`print(s, **kwargs)`

			`def print_banner():`
			`# Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/`
			`banner = """`
Update logo in code as well 2025-10-18 09:31:11 -04:00			`█████ █████`
			`░░███ ░░███`
			`████████ ██████ ████████ ██████ ██████ ░███████ ██████ ███████`
			`░░███░░███ ░░░░░███ ░░███░░███ ███░░███ ███░░███ ░███░░███ ░░░░░███░░░███░`
			`░███ ░███ ███████ ░███ ░███ ░███ ░███░███ ░░░ ░███ ░███ ███████ ░███`
			`░███ ░███ ███░░███ ░███ ░███ ░███ ░███░███ ███ ░███ ░███ ███░░███ ░███ ███`
			`████ █████░░████████ ████ █████░░██████ ░░██████ ████ █████░░███████ ░░█████`
			`░░░░ ░░░░░ ░░░░░░░░ ░░░░ ░░░░░ ░░░░░░ ░░░░░░ ░░░░ ░░░░░ ░░░░░░░░ ░░░░░`
			`"""`
initial commit 2025-10-13 06:49:24 -07:00			`print0(banner)`

fix: safe DDP cleanup (check initialized PG, not just env) (#256) 2025-12-27 23:27:40 -05:00			`def is_ddp_requested() -> bool:`
			`"""`
			`True if launched by torchrun (env present), even before init.`
			`Used to decide whether we should initialize a PG.`
			`"""`
			`return all(k in os.environ for k in ("RANK", "LOCAL_RANK", "WORLD_SIZE"))`

			`def is_ddp_initialized() -> bool:`
			`"""`
			`True if torch.distributed is available and the process group is initialized.`
			`Used at cleanup to avoid destroying a non-existent PG.`
			`"""`
			`return dist.is_available() and dist.is_initialized()`
initial commit 2025-10-13 06:49:24 -07:00
			`def get_dist_info():`
fix: safe DDP cleanup (check initialized PG, not just env) (#256) 2025-12-27 23:27:40 -05:00			`if is_ddp_requested():`
			`# We rely on torchrun's env to decide if we SHOULD init.`
			`# (Initialization itself happens in compute init.)`
initial commit 2025-10-13 06:49:24 -07:00			`assert all(var in os.environ for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])`
			`ddp_rank = int(os.environ['RANK'])`
			`ddp_local_rank = int(os.environ['LOCAL_RANK'])`
			`ddp_world_size = int(os.environ['WORLD_SIZE'])`
			`return True, ddp_rank, ddp_local_rank, ddp_world_size`
			`else:`
			`return False, 0, 0, 1`

add autodetect of device and related stuff. getting weird warnings/errors still, so wip 2025-10-16 10:26:19 -07:00			`def autodetect_device_type():`
			`# prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU`
			`if torch.cuda.is_available():`
			`device_type = "cuda"`
many small tweaks. base, eval, core work now i think 2025-10-16 15:46:18 -07:00			`elif torch.backends.mps.is_available():`
add autodetect of device and related stuff. getting weird warnings/errors still, so wip 2025-10-16 10:26:19 -07:00			`device_type = "mps"`
many small tweaks. base, eval, core work now i think 2025-10-16 15:46:18 -07:00			`else:`
			`device_type = "cpu"`
add autodetect of device and related stuff. getting weird warnings/errors still, so wip 2025-10-16 10:26:19 -07:00			`print0(f"Autodetected device type: {device_type}")`
			`return device_type`

add support for CPU and for MPS. I had to change a few cosmetic things. I also discovered I think a bit of a bug, where I was casting wte to bfloat16 in the wrong place (the model init) instead of in init_weights 2025-10-16 10:04:43 -07:00			`def compute_init(device_type="cuda"): # cuda\|cpu\|mps`
initial commit 2025-10-13 06:49:24 -07:00			`"""Basic initialization that we keep doing over and over, so make common."""`

add support for CPU and for MPS. I had to change a few cosmetic things. I also discovered I think a bit of a bug, where I was casting wte to bfloat16 in the wrong place (the model init) instead of in init_weights 2025-10-16 10:04:43 -07:00			`assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm"`
			`if device_type == "cuda":`
			`assert torch.cuda.is_available(), "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'"`
			`if device_type == "mps":`
			`assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"`
initial commit 2025-10-13 06:49:24 -07:00
			`# Reproducibility`
big change: add pretraining resumption logic so that checkpoints can now be approximately resumed and training can continue. this is useful for very long runs when you don't want the anxiety of your run crashing for some reason. alternatively, it's a way to recover training in the event of loss spikes. i mean, this should have been there in v0 but it's ok. the resumption is approximate to control complexity and bloat, but it's possible we want to change that in the future. to use, set --save_every to a step interval to write checkpoints with, and then use --resume_from_step to resume optimization from a given step. only base model training (pretraining) supports this atm, but it's ok because midtraining is comparably quite a bit faster. 2025-11-13 15:34:40 +00:00			`# Note that we set the global seeds here, but most of the code uses explicit rng objects.`
			`# The only place where global rng might be used is nn.Module initialization of the model weights.`
initial commit 2025-10-13 06:49:24 -07:00			`torch.manual_seed(42)`
trying to add basic cpu support, will try mps too 2025-10-16 16:14:38 +00:00			`if device_type == "cuda":`
			`torch.cuda.manual_seed(42)`
initial commit 2025-10-13 06:49:24 -07:00			`# skipping full reproducibility for now, possibly investigate slowdown later`
			`# torch.use_deterministic_algorithms(True)`

			`# Precision`
add support for CPU and for MPS. I had to change a few cosmetic things. I also discovered I think a bit of a bug, where I was casting wte to bfloat16 in the wrong place (the model init) instead of in init_weights 2025-10-16 10:04:43 -07:00			`if device_type == "cuda":`
fix tf32 warning for deprecated api use 2025-12-27 22:03:06 +00:00			`torch.backends.cuda.matmul.fp32_precision = "tf32" # uses tf32 instead of fp32 for matmuls`
initial commit 2025-10-13 06:49:24 -07:00
trying to add basic cpu support, will try mps too 2025-10-16 16:14:38 +00:00			`# Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA`
fix: safe DDP cleanup (check initialized PG, not just env) (#256) 2025-12-27 23:27:40 -05:00			`is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()`
			`if is_ddp_requested and device_type == "cuda":`
initial commit 2025-10-13 06:49:24 -07:00			`device = torch.device("cuda", ddp_local_rank)`
Update logo in code as well 2025-10-18 09:31:11 -04:00			`torch.cuda.set_device(device) # make "cuda" default to this device`
initial commit 2025-10-13 06:49:24 -07:00			`dist.init_process_group(backend="nccl", device_id=device)`
			`dist.barrier()`
			`else:`
fix typo Co-authored-by: Tancrède Lepoint <tlepoint@users.noreply.github.com> 2025-10-17 08:35:41 -07:00			`device = torch.device(device_type) # mps\|cpu`
initial commit 2025-10-13 06:49:24 -07:00
			`if ddp_rank == 0:`
			`logger.info(f"Distributed world size: {ddp_world_size}")`

fix: safe DDP cleanup (check initialized PG, not just env) (#256) 2025-12-27 23:27:40 -05:00			`return is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size, device`
initial commit 2025-10-13 06:49:24 -07:00
			`def compute_cleanup():`
			`"""Companion function to compute_init, to clean things up before script exit"""`
fix: safe DDP cleanup (check initialized PG, not just env) (#256) 2025-12-27 23:27:40 -05:00			`if is_ddp_initialized():`
initial commit 2025-10-13 06:49:24 -07:00			`dist.destroy_process_group()`

			`class DummyWandb:`
			`"""Useful if we wish to not use wandb but have all the same signatures"""`
			`def __init__(self):`
			`pass`
			`def log(self, args, *kwargs):`
			`pass`
			`def finish(self):`
			`pass`
add detection of device to report more correct mfu for bf16 2026-01-17 03:16:12 +00:00
			`# hardcoded BF16 peak flops for NVIDIA A100, H100, H200, B200 GPU and AMD MI250, MI300X, MI325X, MI355X and Intel PVC`
			`# inspired by torchtitan: https://github.com/pytorch/torchtitan/blob/main/torchtitan/tools/utils.py`
			`def get_peak_flops(device_name: str) -> float:`
			`if "A100" in device_name:`
			`# data from https://www.nvidia.com/en-us/data-center/a100/`
			`return 312e12`
			`elif "H100" in device_name:`
			`# data from https://www.nvidia.com/en-us/data-center/h100/`
			`# NOTE: Specifications are one-half lower without sparsity.`
			`if "NVL" in device_name:`
			`return 835e12`
			`elif "PCIe" in device_name:`
			`return 756e12`
			`else: # for H100 SXM and other variants`
			`return 989e12`
			`elif "H200" in device_name:`
			`# data from https://www.nvidia.com/en-us/data-center/h200/`
			`return 989e12`
			`elif "B200" in device_name:`
			`# data from https://nvdam.widen.net/s/wwnsxrhm2w/blackwell-datasheet-3384703`
			`return 2.25e15`
			`elif "MI355X" in device_name:`
			`# MI355X data from https://www.amd.com/en/products/accelerators/instinct/mi350/mi355x.html`
			`return 2500e12`
			`elif "MI300X" in device_name or "MI325X" in device_name:`
			`# MI300X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html`
			`# MI325X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html`
			`return 1300e12`
			`elif "MI250X" in device_name:`
			`# data from https://www.amd.com/en/products/accelerators/instinct/mi200/mi250x.html (per GCD)`
			`return 191.5e12`
			`elif "Data Center GPU Max 1550" in device_name:`
			`# Also known as Ponte Vecchio (PVC).`
			`# data from https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html`
			`# Dot Product Accumulate Systolic (DPAS):`
			`# - Freq: 1300MHz`
			`# - #ops: 512`
			`# Full EU mode (i.e. 512 max compute units): 340.8 TFLOPS (BF16)`
			`# Standard EU mode (i.e. 448 max compute units): 298.2 TFLOPS (BF16)`
			`max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units`
			`return 512 * max_comp_units * 1300 * 10**6`
			`elif "l40s" in device_name:`
			`# data from: "https://resources.nvidia.com/en-us-l40s/l40s-datasheet-28413"`
			`return 362e12`

			`else: # for other GPU types, assume A100`
			`logger.warning(f"Peak flops undefined for: {device_name}, fallback to A100")`
			`return 312e12`