Basics of running TensorFlow on NAISS systems¶
Getting the notebook¶
This notebook and the other demo material can be found in the GitHub repository for the course website at https://github.com/NAISS-Training/ai-intro/tree/main/docs/demos/.
You can download it with:
wget https://raw.githubusercontent.com/NAISS-Training/ai-intro/refs/heads/main/docs/demos/tf_basics.ipynb
Setting up the software environment¶
The software for this demo was set up using an Apptainer container. Apptainer only runs on Linux, but is available on most HPC clusters. The recipe for this container can be found among the demo material. To fetch and build it, run:
wget https://raw.githubusercontent.com/NAISS-Training/ai-intro/refs/heads/main/docs/demos/tf-bundle.def
apptainer build tf-bundle.sif tf-bundle.def
You can then launch a JupyterLab instance with:
apptainer exec tf-bundle.sif jupyter lab
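If you want the container to see the node's NVIDIA GPUs, you will likely also need Apptainer's --nv flag; this is standard Apptainer usage rather than something specific to this recipe:
apptainer exec --nv tf-bundle.sif jupyter lab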
Alvis Open OnDemand Runtime¶
To use this environment with the Alvis Open OnDemand portal, you want to set up a custom runtime:
wget https://raw.githubusercontent.com/NAISS-Training/ai-intro/refs/heads/main/docs/demos/tf-container.sh
mkdir -p ~/portal/jupyter/
mv tf-container.sh ~/portal/jupyter/
Once you have done this, the runtime should show up as an option in the runtime drop-down of the Jupyter interactive app.
Dataset set-up¶
In [ ]:
import os
import pickle
from glob import iglob
import numpy as np
from tensorflow.data import Dataset
def get_cifar10(datadir: str = "/mimer/NOBACKUP/Datasets/CIFAR/cifar-10-batches-py") -> Dataset:
    x, y = [], []
    # Each data_batch_* file is a pickled dict with raw image bytes and labels
    for file in iglob(os.path.join(datadir, "data_batch_*")):
        with open(file, "rb") as f:
            d = pickle.load(f, encoding="bytes")
        x.append(d[b"data"])
        y.extend(d[b"labels"])
    # Reshape from flat (N, 3072) rows to channels-last (N, 32, 32, 3) images
    x = np.concatenate(x).reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    y = np.array(y)
    return Dataset.from_tensor_slices((x, y))
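As a quick sanity check you can inspect the dataset returned above. The cell below is only an illustration (it is not part of the original demo) and assumes the CIFAR-10 path used as the default above is readable:
In [ ]:
# Illustrative only: inspect the element types/shapes and the number of examples
ds = get_cifar10()
print(ds.element_spec)            # (32, 32, 3) uint8 images and scalar integer labels
print(ds.cardinality().numpy())   # 50000 training images across the five batches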
Model set-up¶
In [ ]:
from tensorflow.keras import layers, models, Model
def get_model(
    *,
    num_classes: int,
    input_shape: tuple[int, ...] = (32, 32, 3),
) -> Model:
    return models.Sequential([
        # Use the input_shape argument rather than a hard-coded shape
        layers.Conv2D(64, 3, padding="same", activation="relu", input_shape=input_shape),
        layers.Conv2D(64, 3, padding="same", activation="relu"),
        layers.MaxPooling2D(),
        layers.Conv2D(128, 3, padding="same", activation="relu"),
        layers.Conv2D(128, 3, padding="same", activation="relu"),
        layers.MaxPooling2D(),
        layers.Conv2D(256, 3, padding="same", activation="relu"),
        layers.Conv2D(256, 3, padding="same", activation="relu"),
        layers.GlobalMaxPooling2D(),
        layers.Dense(1024, activation="relu"),
        layers.Dense(num_classes),  # no activation: the model outputs logits
    ])
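If you want to confirm that the network is wired up as intended, a small check like the following (not part of the original notebook) builds the model and prints its layer summary. Note that the final Dense layer has no activation, which is why the loss below uses from_logits=True:
In [ ]:
# Illustrative only: build the model and inspect its layers and parameter counts
m = get_model(num_classes=10)
m.summary()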
In [ ]:
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers import AdamW
# Set seed for reproducibility
# see also https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism
tf.random.set_seed(10037)
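# If you need full run-to-run reproducibility on GPU you can additionally opt in to
# deterministic ops (not enabled in this demo, and it can slow training down):
# tf.config.experimental.enable_op_determinism()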
# Prepare dataset (this is a tiny dataset so we are generous with the shuffling)
dataset = get_cifar10()
dataset = dataset.shuffle(buffer_size=dataset.cardinality(), reshuffle_each_iteration=True)
# Prepare model, optimizer and metrics
model = get_model(num_classes=10)
model.compile(
    optimizer=AdamW(learning_rate=1e-3),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=[
        SparseCategoricalAccuracy(name="acc"),
    ],
)
# Add checkpointing callback
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="tf_checkpoints/epoch_{epoch:02d}",
    save_weights_only=True,
    save_best_only=False,
)
# Run the training
history = model.fit(
    dataset.batch(512).prefetch(tf.data.AUTOTUNE),
    epochs=5,
    callbacks=[checkpoint_cb],
)
# The stderr is unfortunately spammed due to
# - https://github.com/tensorflow/tensorflow/issues/73487
# but I don't want to filter it entirely in case there are some relevant warnings
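The History object returned by fit keeps the per-epoch loss and metrics. A small, illustrative way to look at them (not in the original demo) is:
In [ ]:
# Illustrative only: print the per-epoch metrics recorded by model.fit
for epoch, (loss, acc) in enumerate(zip(history.history["loss"], history.history["acc"]), start=1):
    print(f"epoch {epoch}: loss={loss:.3f} acc={acc:.3f}")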
In [ ]:
# TensorFlow reserves most of the VRAM up front, so external monitoring tools
# report the allocation rather than what the model actually used.
# This is one way to get the peak usage from inside TensorFlow instead.
for gpu in tf.config.list_physical_devices("GPU"):
    # https://www.tensorflow.org/api_docs/python/tf/config/experimental/get_memory_info
    # gpu.name is e.g. "/physical_device:GPU:0"; get_memory_info expects "GPU:0"
    peak_vram = tf.config.experimental.get_memory_info(gpu.name[-5:])["peak"]
    print(f"Peak memory used by {gpu.name}: {peak_vram / 1024**3:.2f} GiB")
Exercises¶
- Check with monitoring tools that the code makes use of the GPU as expected.
- Optional: read the Keras documentation on the ModelCheckpoint callback and:
  - Figure out where the checkpoints are stored
  - Continue training from a checkpoint (a sketch of one way to do this follows below)
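A minimal, hedged sketch of resuming training, assuming the weights-only checkpoint paths used above (epoch_{epoch:02d}) and that you want to pick up after the fifth epoch:
In [ ]:
# Sketch only: restore weights from the last checkpoint and continue training
model.load_weights("tf_checkpoints/epoch_05")
model.fit(
    dataset.batch(512).prefetch(tf.data.AUTOTUNE),
    initial_epoch=5,   # continue counting epochs from where the first run stopped
    epochs=10,
    callbacks=[checkpoint_cb],
)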