scripts/base_eval.py

"""
Evaluate the CORE metric for a given model.

Run on a single GPU:
python -m scripts.base_eval

Run with torchrun on e.g. 8 GPUs:
torchrun --nproc_per_node=8 -m scripts.base_eval

The script prints the per-task results and the CORE metric to the console and
also writes them to a CSV file under <base_dir>/base_eval/.
"""
import os
import csv
import time
import json
import yaml
import shutil
import random
import zipfile
import tempfile
from contextlib import nullcontext

import torch

from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
from nanochat.tokenizer import HuggingFaceTokenizer
from nanochat.checkpoint_manager import load_model
from nanochat.core_eval import evaluate_task

# -----------------------------------------------------------------------------
# nanochat specific function dealing with I/O etc.

# ~162MB of data needed to evaluate the CORE metric
# (fetched on demand by evaluate_model when <base_dir>/eval_bundle does not exist yet)
EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"

def place_eval_bundle(file_path):
    """
    Unzip the downloaded eval bundle and place it at <base_dir>/eval_bundle.

    - file_path: path to the eval_bundle.zip file on disk

    The archive is expected to contain a top-level "eval_bundle" directory.
    Raises FileNotFoundError if that directory is missing after extraction.
    Does nothing (besides printing a note) if the destination directory
    already exists.
    """
    base_dir = get_base_dir()
    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
    # shutil.move into an existing directory would nest the bundle as
    # eval_bundle/eval_bundle, so treat an existing destination as already placed
    if os.path.isdir(eval_bundle_dir):
        print0(f"eval_bundle directory already exists at {eval_bundle_dir}, skipping extraction")
        return
    os.makedirs(base_dir, exist_ok=True)
    # stage the extraction inside base_dir so that the final move stays on one
    # filesystem (a plain rename) instead of a slow, interruptible copy that
    # could leave a half-populated eval_bundle directory behind
    with tempfile.TemporaryDirectory(dir=base_dir) as tmpdir:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(tmpdir)
        extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
        if not os.path.isdir(extracted_bundle_dir):
            raise FileNotFoundError(f"Archive {file_path} does not contain an eval_bundle directory")
        shutil.move(extracted_bundle_dir, eval_bundle_dir)
    print0(f"Placed eval_bundle directory at {eval_bundle_dir}")

def evaluate_model(model, tokenizer, device, max_per_task=-1):
    """
    Evaluate a base model on the CORE benchmark.

    Downloads the eval bundle into the base directory if it is not there yet,
    then runs every task listed under 'icl_tasks' in core.yaml and centers each
    accuracy against the task's random baseline from eval_meta_data.csv.

    - model, tokenizer, device: forwarded unchanged to evaluate_task
    - max_per_task: crop the data to this many examples per task for testing (-1 = disable)

    Returns a dict with:
    - "results": task label -> accuracy as returned by evaluate_task
    - "centered_results": task label -> baseline-centered accuracy
    - "core_metric": mean of the centered results over all tasks

    Raises KeyError if a task in core.yaml has no random baseline in eval_meta_data.csv.
    """
    # Load config and task metadata
    base_dir = get_base_dir()
    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
    # Download the eval bundle to disk (and unzip if needed)
    if not os.path.exists(eval_bundle_dir):
        download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
    config_path = os.path.join(eval_bundle_dir, "core.yaml")
    data_base_path = os.path.join(eval_bundle_dir, "eval_data")
    eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    tasks = config['icl_tasks']

    # Load random baseline values from eval metadata
    random_baselines = {}
    with open(eval_meta_data, 'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            random_baselines[row['Eval Task']] = float(row['Random baseline'])

    # Fail before any (expensive) evaluation if a task has no baseline, instead
    # of hitting a bare KeyError only after that task has already been evaluated
    missing = [task['label'] for task in tasks if task['label'] not in random_baselines]
    if missing:
        raise KeyError(f"No random baseline found in {eval_meta_data} for task(s): {missing}")

    # Evaluate each task
    results = {}
    centered_results = {}
    for task in tasks:
        start_time = time.time()
        label = task['label']
        task_meta = {
            'task_type': task['icl_task_type'],
            'dataset_uri': task['dataset_uri'],
            'num_fewshot': task['num_fewshot'][0], # only the first listed few-shot setting is used
            'continuation_delimiter': task.get('continuation_delimiter', ' ')
        }
        print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')

        # Load data for this task (one JSON object per line)
        data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
        with open(data_path, 'r', encoding='utf-8') as f:
            # skip blank lines (e.g. a trailing newline at the end of the file),
            # which json.loads would otherwise reject
            data = [json.loads(stripped) for stripped in map(str.strip, f) if stripped]

        # shuffle the data because in many cases it appears ordered but we want
        # the ability to only run a subset of the data for debugging purposes etc.
        # (fixed seed, so the subset is the same on every run)
        shuffle_rng = random.Random(1337)
        shuffle_rng.shuffle(data)
        if max_per_task > 0:
            data = data[:max_per_task]

        # run the evaluation for this task
        accuracy = evaluate_task(model, tokenizer, data, device, task_meta)

        results[label] = accuracy
        # the baseline from the csv is scaled by 0.01 before use, then the accuracy
        # is rescaled so that the baseline maps to 0 and a perfect score maps to 1
        random_baseline = random_baselines[label]
        centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
        centered_results[label] = centered_result
        end_time = time.time()
        print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")

    core_metric = sum(centered_results.values()) / len(centered_results)
    out = {
        "results": results,
        "centered_results": centered_results,
        "core_metric": core_metric
    }
    return out

# -----------------------------------------------------------------------------
# HuggingFace loading utilities and light wrappers for a model

class ModelWrapper:
    """
    Thin adapter around a HuggingFace model: calling the wrapper runs the
    underlying model on the given input ids and hands back only the logits
    attribute of its output.
    - model: the wrapped model, kept as self.model
    - max_seq_len: optional value stored as self.max_seq_len (None if not given)
    """
    def __init__(self, model, max_seq_len=None):
        self.max_seq_len = max_seq_len
        self.model = model

    def __call__(self, input_ids):
        # forward pass; everything except the logits is dropped
        return self.model(input_ids).logits

def load_hf_model(hf_path: str, device):
    """
    Load a HuggingFace causal LM together with its tokenizer.
    - hf_path: model identifier/path handed to from_pretrained
    - device: device the model is moved to
    Returns (model, tokenizer), where model is a ModelWrapper around the
    HuggingFace model (switched to eval mode).
    """
    print0(f"Loading model from: {hf_path}")
    # transformers is imported here, only when this loader is actually called
    from transformers import AutoModelForCausalLM
    hf_model = AutoModelForCausalLM.from_pretrained(hf_path)
    hf_model.to(device)
    hf_model.eval()
    # only the openai-community/gpt2 family gets an explicit sequence length cap
    if "openai-community/gpt2" in hf_path:
        seq_len_cap = 1024
    else:
        seq_len_cap = None
    wrapped = ModelWrapper(hf_model, max_seq_len=seq_len_cap)
    # the tokenizer comes from the same path as the model
    return wrapped, HuggingFaceTokenizer.from_pretrained(hf_path)

# -----------------------------------------------------------------------------
def main():
    """
    Command line entry point: load a model, evaluate it on CORE and record the results.

    The model is either a HuggingFace model (--hf-path) or a local base model
    loaded via load_model (optionally selected with --model-tag / --step).
    Rank 0 writes the results to <base_dir>/base_eval/<model_slug>.csv and
    prints that file; the results are then logged to the report.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
    # --model-tag and --step are only forwarded to load_model (local model branch below)
    parser.add_argument('--model-tag', type=str, default=None, help='optional model tag of the local base model to load (ignored with --hf-path)')
    # parsed as an integer: the step is treated as a number elsewhere (meta['step'] is
    # formatted with :06d below). NOTE(review): assumes load_model expects an int step - confirm
    parser.add_argument('--step', type=int, default=None, help='optional checkpoint step of the local base model to load (ignored with --hf-path)')
    args = parser.parse_args()

    # distributed / precision setup
    device_type = autodetect_device_type()
    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
    # bfloat16 autocast on cuda only, otherwise a no-op context
    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()

    # Load model and tokenizer from command line or from file system
    if args.hf_path is not None:
        # atm assume that if a path is given, it's a huggingface model path
        hf_path = args.hf_path
        print0(f"Loading huggingface model from: {hf_path}")
        model, tokenizer = load_hf_model(hf_path, device)
        model_name = hf_path # just for logging
        model_slug = hf_path.replace("/", "-") # for the output csv file
    else:
        # load a local model from the file system
        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step)
        model_name = f"base_model (step {meta['step']})" # just for logging
        model_slug = f"base_model_{meta['step']:06d}" # for the output csv file

    # Evaluate the model
    with autocast_ctx:
        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)

    # Write out the results to a csv file
    # (defaults below are what ranks other than 0 pass to the report)
    core_metric = None
    centered_results = {}
    if ddp_rank == 0:
        base_dir = get_base_dir()
        output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
        os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
        results = out["results"]
        centered_results = out["centered_results"]
        core_metric = out["core_metric"]
        with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
            # columns are padded to fixed widths so the file also reads as an aligned table
            f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
            for label in results:
                f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
            f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
        # Print the content of the csv file to console too
        print0("="*80)
        print0(f"Model: {model_name}")
        print0("="*80)
        with open(output_csv_path, 'r', encoding='utf-8') as f:
            print0(f.read())

    # Log to report
    from nanochat.report import get_report
    get_report().log(section="Base model evaluation", data=[
        {
            "Model": model_name,
            "CORE metric": core_metric,
        },
        centered_results, # the full table
    ])

    compute_cleanup()

# Run the evaluation only when executed as a script (e.g. python -m scripts.base_eval), not on import
if __name__ == "__main__":
    main()
initial commit 2025-10-13 06:49:24 -07:00			`"""`
delete pandas dep in base_eval use csv instead 2025-11-01 15:28:30 +00:00			`Evaluate the CORE metric for a given model.`
initial commit 2025-10-13 06:49:24 -07:00
			`Run on a single GPU:`
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`python -m scripts.base_eval`
initial commit 2025-10-13 06:49:24 -07:00
			`Run with torchrun on e.g. 8 GPUs:`
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`torchrun --nproc_per_node=8 -m scripts.base_eval`
initial commit 2025-10-13 06:49:24 -07:00
			`The script will print the CORE metric to the console.`
			`"""`
			`import os`
delete pandas dep in base_eval use csv instead 2025-11-01 15:28:30 +00:00			`import csv`
initial commit 2025-10-13 06:49:24 -07:00			`import time`
			`import json`
			`import yaml`
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`import shutil`
			`import random`
			`import zipfile`
			`import tempfile`
many small tweaks. base, eval, core work now i think 2025-10-16 15:46:18 -07:00			`from contextlib import nullcontext`
initial commit 2025-10-13 06:49:24 -07:00
			`import torch`

move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock`
initial commit 2025-10-13 06:49:24 -07:00			`from nanochat.tokenizer import HuggingFaceTokenizer`
			`from nanochat.checkpoint_manager import load_model`
			`from nanochat.core_eval import evaluate_task`

			`# -----------------------------------------------------------------------------`
delete pandas dep in base_eval use csv instead 2025-11-01 15:28:30 +00:00			`# nanochat specific function dealing with I/O etc.`
initial commit 2025-10-13 06:49:24 -07:00
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`# ~162MB of data needed to evaluate the CORE metric`
			`EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"`

			`def place_eval_bundle(file_path):`
			`# here file_path is the path to the eval_bundle.zip file`
			`# we need to unzip it and place it in the base directory`
			`base_dir = get_base_dir()`
			`eval_bundle_dir = os.path.join(base_dir, "eval_bundle")`
			`with tempfile.TemporaryDirectory() as tmpdir:`
			`with zipfile.ZipFile(file_path, 'r') as zip_ref:`
			`zip_ref.extractall(tmpdir)`
			`extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")`
			`shutil.move(extracted_bundle_dir, eval_bundle_dir)`
			`print0(f"Placed eval_bundle directory at {eval_bundle_dir}")`

initial commit 2025-10-13 06:49:24 -07:00			`def evaluate_model(model, tokenizer, device, max_per_task=-1):`
			`"""`
			`Evaluate a base model on the CORE benchmark.`
			`- max_per_task: crop the data to this many examples per task for testing (-1 = disable)`
			`"""`
			`# Load config and task metadata`
			`base_dir = get_base_dir()`
			`eval_bundle_dir = os.path.join(base_dir, "eval_bundle")`
move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts 2025-11-01 16:04:38 +00:00			`# Download the eval bundle to disk (and unzip if needed)`
			`if not os.path.exists(eval_bundle_dir):`
			`download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)`
initial commit 2025-10-13 06:49:24 -07:00			`config_path = os.path.join(eval_bundle_dir, "core.yaml")`
			`data_base_path = os.path.join(eval_bundle_dir, "eval_data")`
			`eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")`
add explicit UTF-8 encoding 2025-11-03 21:27:12 +01:00			`with open(config_path, 'r', encoding='utf-8') as f:`
initial commit 2025-10-13 06:49:24 -07:00			`config = yaml.safe_load(f)`
			`tasks = config['icl_tasks']`
delete pandas dep in base_eval use csv instead 2025-11-01 15:28:30 +00:00
			`# Load random baseline values from eval metadata`
			`random_baselines = {}`
			`with open(eval_meta_data, 'r', encoding='utf-8') as f:`
			`reader = csv.DictReader(f)`
			`for row in reader:`
			`task_name = row['Eval Task']`
			`random_baseline = row['Random baseline']`
			`random_baselines[task_name] = float(random_baseline)`
initial commit 2025-10-13 06:49:24 -07:00
			`# Evaluate each task`
			`results = {}`
			`centered_results = {}`
			`for task in tasks:`
			`start_time = time.time()`
			`label = task['label']`
			`task_meta = {`
			`'task_type': task['icl_task_type'],`
			`'dataset_uri': task['dataset_uri'],`
			`'num_fewshot': task['num_fewshot'][0],`
			`'continuation_delimiter': task.get('continuation_delimiter', ' ')`
			`}`
			`print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')`

			`# Load data for this task`
			`data_path = os.path.join(data_base_path, task_meta['dataset_uri'])`
fix: open JSONL and results CSV with UTF-8 encoding for portability 2025-11-03 01:20:56 -05:00			`with open(data_path, 'r', encoding='utf-8') as f:`
initial commit 2025-10-13 06:49:24 -07:00			`data = [json.loads(line.strip()) for line in f]`

			`# shuffle the data because in many cases it appears ordered but we want`
typo fixes in scripts 2025-10-28 20:17:31 +01:00			`# the ability to only run a subset of the data for debugging purposes etc.`
initial commit 2025-10-13 06:49:24 -07:00			`shuffle_rng = random.Random(1337)`
			`shuffle_rng.shuffle(data)`
			`if max_per_task > 0:`
			`data = data[:max_per_task]`

			`# run the evaluation for this task`
			`accuracy = evaluate_task(model, tokenizer, data, device, task_meta)`

			`results[label] = accuracy`
delete pandas dep in base_eval use csv instead 2025-11-01 15:28:30 +00:00			`random_baseline = random_baselines[label]`
initial commit 2025-10-13 06:49:24 -07:00			`centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)`
			`centered_results[label] = centered_result`
			`end_time = time.time()`
			`print0(f"accuracy: {accuracy:.4f} \| centered: {centered_result:.4f} \| time: {end_time - start_time:.2f}s")`

			`core_metric = sum(centered_results.values()) / len(centered_results)`
			`out = {`
			`"results": results,`
			`"centered_results": centered_results,`
			`"core_metric": core_metric`
			`}`
			`return out`

			`# -----------------------------------------------------------------------------`
			`# HuggingFace loading utilities and light wrappers for a model`

			`class ModelWrapper:`
			`"""Lightweight wrapper for a HuggingFace model"""`
			`def __init__(self, model, max_seq_len=None):`
			`self.model = model`
			`self.max_seq_len = max_seq_len`

			`def __call__(self, input_ids):`
			`outputs = self.model(input_ids)`
			`logits = outputs.logits`
			`return logits`

			`def load_hf_model(hf_path: str, device):`
			`print0(f"Loading model from: {hf_path}")`
			`# Load the model`
			`from transformers import AutoModelForCausalLM`
			`model = AutoModelForCausalLM.from_pretrained(hf_path)`
			`model.to(device)`
			`model.eval()`
			`max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None`
			`model = ModelWrapper(model, max_seq_len=max_seq_len)`
			`# Load the tokenizer`
			`tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)`
			`return model, tokenizer`

			`# -----------------------------------------------------------------------------`
			`def main():`
bugfix keep same args style in scripts/base_eval.py 2025-12-26 17:29:08 +08:00			`import argparse`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')`
			`parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')`
bugfix 2025-12-26 17:41:57 +08:00			`parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name')`
			`parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name')`
bugfix keep same args style in scripts/base_eval.py 2025-12-26 17:29:08 +08:00			`args = parser.parse_args()`

initial commit 2025-10-13 06:49:24 -07:00			`# distributed / precision setup`
add autodetect of device and related stuff. getting weird warnings/errors still, so wip 2025-10-16 10:26:19 -07:00			`device_type = autodetect_device_type()`
			`ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)`
many small tweaks. base, eval, core work now i think 2025-10-16 15:46:18 -07:00			`autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()`
initial commit 2025-10-13 06:49:24 -07:00
			`# Load model and tokenizer from command line or from file system`
bugfix keep same args style in scripts/base_eval.py 2025-12-26 17:29:08 +08:00			`if args.hf_path is not None:`
initial commit 2025-10-13 06:49:24 -07:00			`# atm assume that if a path is given, it's a huggingface model path`
bugfix keep same args style in scripts/base_eval.py 2025-12-26 17:29:08 +08:00			`hf_path = args.hf_path`
initial commit 2025-10-13 06:49:24 -07:00			`print0(f"Loading huggingface model from: {hf_path}")`
			`model, tokenizer = load_hf_model(hf_path, device)`
			`model_name = hf_path # just for logging`
			`model_slug = hf_path.replace("/", "-") # for the output csv file`
			`else:`
			`# load a local model from the file system`
bugfix 2025-12-26 17:41:57 +08:00			`model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step)`
initial commit 2025-10-13 06:49:24 -07:00			`model_name = f"base_model (step {meta['step']})" # just for logging`
			`model_slug = f"base_model_{meta['step']:06d}" # for the output csv file`

			`# Evaluate the model`
			`with autocast_ctx:`
bugfix keep same args style in scripts/base_eval.py 2025-12-26 17:29:08 +08:00			`out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)`
initial commit 2025-10-13 06:49:24 -07:00
			`# Write out the results to a csv file`
			`core_metric = None`
			`centered_results = {}`
			`if ddp_rank == 0:`
			`base_dir = get_base_dir()`
			`output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")`
			`os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)`
			`results = out["results"]`
			`centered_results = out["centered_results"]`
			`core_metric = out["core_metric"]`
fix: open JSONL and results CSV with UTF-8 encoding for portability 2025-11-03 01:20:56 -05:00			`with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:`
initial commit 2025-10-13 06:49:24 -07:00			`f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")`
			`for label in results:`
			`f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")`
			`f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")`
			`# Print the content of the csv file to console too`
			`print0("="*80)`
			`print0(f"Model: {model_name}")`
			`print0("="*80)`
add explicit UTF-8 encoding 2025-11-03 21:27:12 +01:00			`with open(output_csv_path, 'r', encoding='utf-8') as f:`
initial commit 2025-10-13 06:49:24 -07:00			`print0(f.read())`

			`# Log to report`
			`from nanochat.report import get_report`
			`get_report().log(section="Base model evaluation", data=[`
			`{`
			`"Model": model_name,`
			`"CORE metric": core_metric,`
			`},`
			`centered_results, # the full table`
			`])`

			`compute_cleanup()`

			`if __name__ == "__main__":`
			`main()`