Hi, I am trying to use AdaptDL to adaptively change the batch size. However, I found that training becomes much slower with AdaptDL (6.656 iters/s vs. 17.467 iters/s), even though according to the Pollux paper (OSDI '21) the system overhead should be small enough to be negligible. By timing every line of code, I found that the main overhead is in the backward pass (loss.backward()). I don't know whether it comes from the backward hook or from something wrong in my code.
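For reference, this is a minimal sketch of the kind of per-step timing I mean (the timed helper is just for illustration; the torch.cuda.synchronize() calls matter, because CUDA kernels launch asynchronously and without them the wait can show up on an unrelated line):

import time
import torch

def timed(label, fn):
    # synchronize before and after so the asynchronous CUDA kernels
    # launched by fn() are actually included in the measurement
    torch.cuda.synchronize()
    start = time.time()
    out = fn()
    torch.cuda.synchronize()
    print("{}: {:.4f} s".format(label, time.time() - start))
    return out

# inside the training loop:
#   outputs = timed("forward", lambda: net(inputs))
#   timed("backward", lambda: loss.backward())
#   timed("step", lambda: optimizer.step())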
I am using Python 3.6.13, torch 1.10.2, and CUDA 11.6 on an A100 GPU.
Here is the code to reproduce the problem on a single GPU. Although only one GPU is needed, the baseline uses PyTorch DDP for a fair comparison with AdaptDL.
The code with AdaptDL:
import os
import time
import argparse

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import StepLR

import adaptdl  # Changed in step 1
import adaptdl.torch  # Changed in step 1

MAX_GLOBAL_BATCHSIZE = 1024


def main():
    # 0. set up the distributed device and process group
    rank = int(os.environ["ADAPTDL_REPLICA_RANK"])
    adaptdl.torch.init_process_group("nccl" if torch.cuda.is_available()
                                     else "gloo")  # Changed in step 1
    # 1. define network, optimizer and scheduler
    device = torch.device("cuda:" + str(rank))
    net = torchvision.models.vgg16()
    net = net.to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=0.001 * 2,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )
    scheduler = StepLR(optimizer, step_size=1)
    net = adaptdl.torch.AdaptiveDataParallel(net, optimizer, scheduler)  # Changed in step 1
    # 2. load dataset
    transform = transforms.Compose(
        [transforms.RandomHorizontalFlip(),
         transforms.RandomGrayscale(),
         transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset = torchvision.datasets.CIFAR10(root="./", train=True,
                                            download=True, transform=transform)
    train_loader = adaptdl.torch.AdaptiveDataLoader(
        trainset, drop_last=True, batch_size=args.batch_size)  # Changed in step 2
    if args.adaptive_batch_size != 0:
        train_loader.autoscale_batch_size(
            MAX_GLOBAL_BATCHSIZE,
            local_bsz_bounds=(args.batch_size // 4 + 1,
                              args.batch_size * 2))  # Changed in step 3, optional
    # 3. define loss
    criterion = nn.CrossEntropyLoss()
    # use a timestamp to measure scaling overhead
    print("training iteration start: " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # 4. start to train
    net.train()
    epochs = 2
    start_time = time.time()
    for epoch in adaptdl.torch.remaining_epochs_until(epochs):  # Changed in step 4
        train_loss = correct = total = 0
        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()
            if rank == 0 and idx % 20 == 0:
                print(
                    " == Train Epoch: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                        idx + 1,
                        len(train_loader),
                        epoch,
                        epochs,
                        train_loss / (idx + 1),
                        100.0 * correct / total,
                    )
                )
                print("throughput: {:.3f} iters/s".format(
                    (epoch * len(train_loader) + idx + 1) / (time.time() - start_time)))
        scheduler.step()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Launch DDP processes")
    parser.add_argument('--local_rank', type=int, help='local rank of GPU')
    parser.add_argument('-b', '--batch_size', type=int, default=64,
                        help='local batch size')
    parser.add_argument('--adaptive_batch_size', type=int, default=1, required=True,
                        help='autoscale batch size')
    args = parser.parse_args()
    main()
To run this, the command line is:
ADAPTDL_MASTER_ADDR=127.0.0.1 ADAPTDL_MASTER_PORT=22223 ADAPTDL_NUM_REPLICAS=1 ADAPTDL_REPLICA_RANK=0 ADAPTDL_CHECKPOINT_PATH=./ ADAPTDL_NUM_RESTARTS=0 python vgg16_adaptdl.py --adaptive_batch_size 0
The code without AdaptDL:
import os
import time
import argparse

import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim.lr_scheduler import StepLR

MAX_GLOBAL_BATCHSIZE = 1024


def main():
    # 0. set up the distributed device and process group
    rank = int(os.environ["RANK"])
    dist.init_process_group("nccl" if torch.cuda.is_available()
                            else "gloo")
    # 1. define network, optimizer and scheduler
    device = torch.device("cuda:" + str(rank))
    net = torchvision.models.vgg16()
    net = net.to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=0.001 * 2,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )
    scheduler = StepLR(optimizer, step_size=1)
    net = DDP(net, device_ids=[rank])
    # 2. load dataset
    transform = transforms.Compose(
        [transforms.RandomHorizontalFlip(),
         transforms.RandomGrayscale(),
         transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    trainset = torchvision.datasets.CIFAR10(root="./", train=True,
                                            download=True, transform=transform)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainset,
        shuffle=True,
        rank=rank,
    )
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=args.batch_size,
        num_workers=1,
        pin_memory=True,
        sampler=train_sampler,
        drop_last=True,
    )
    # 3. define loss
    criterion = nn.CrossEntropyLoss()
    # use a timestamp to measure scaling overhead
    print("training iteration start: " + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # 4. start to train
    net.train()
    epochs = 2
    start_time = time.time()
    for epoch in range(epochs):
        train_sampler.set_epoch(epoch)  # reshuffle each epoch
        train_loss = correct = total = 0
        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            total += targets.size(0)
            correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()
            if rank == 0 and idx % 20 == 0:
                print(
                    " == Train Epoch: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
                        idx + 1,
                        len(train_loader),
                        epoch,
                        epochs,
                        train_loss / (idx + 1),
                        100.0 * correct / total,
                    )
                )
                print("throughput: {:.3f} iters/s".format(
                    (epoch * len(train_loader) + idx + 1) / (time.time() - start_time)))
        scheduler.step()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Launch DDP processes")
    parser.add_argument('--local_rank', type=int, help='local rank of GPU')
    parser.add_argument('-b', '--batch_size', type=int, default=64,
                        help='local batch size')
    parser.add_argument('--adaptive_batch_size', type=int, default=1, required=True,
                        help='autoscale batch size')
    args = parser.parse_args()
    main()
To run this, the command line is:
MASTER_ADDR=127.0.0.1 MASTER_PORT=22222 RANK=0 WORLD_SIZE=1 python vgg16.py --adaptive_batch_size 0
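In case it is useful, I can also break the backward pass down further with torch.profiler; a minimal sketch of what I have in mind, reusing net, criterion, optimizer, device, and train_loader from the scripts above (torch 1.10.2 includes this profiler):

from torch.profiler import profile, ProfilerActivity

# profile a few steps to see which ops dominate loss.backward()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for idx, (inputs, targets) in enumerate(train_loader):
        if idx == 10:  # a handful of steps is enough for a breakdown
            break
        inputs, targets = inputs.to(device), targets.to(device)
        loss = criterion(net(inputs), targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))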
I would really appreciate any help!