import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
import argparse
import torch as th
import pytorch_lightning as pl
from transformers import GPT2LMHeadModel
from pytorch_lightning import Trainer, loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers.optimization import get_linear_schedule_with_warmup
from dataset import GPT2DataModel
class GPT2FinetuneMedicalQAModelCheckpoint:
    @staticmethod
    def add_argparse_args(parent_args):
        parser = parent_args.add_argument_group('BaseModel')
        parser.add_argument('--monitor', default='train_loss', type=str)
        parser.add_argument('--mode', default='min', type=str)
        parser.add_argument('--dirpath', default='./ckpt/', type=str)
        parser.add_argument('--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str)
        parser.add_argument('--save_last', action='store_true', default=True)
        parser.add_argument('--save_top_k', default=3, type=int)
        parser.add_argument('--every_n_train_steps', default=1000, type=int)
        parser.add_argument('--save_weights_only', default=True, type=bool)
        return parent_args

    def __init__(self, args):
        self.callbacks = ModelCheckpoint(monitor=args.monitor, save_top_k=args.save_top_k, mode=args.mode,
                                         every_n_train_steps=args.every_n_train_steps,
                                         save_weights_only=args.save_weights_only, dirpath=args.dirpath,
                                         filename=args.filename, save_last=args.save_last)
class GPT2Finetune(pl.LightningModule):
    @staticmethod
    def add_model_specific_args(parent_args):
        parser = parent_args.add_argument_group("BaseModel")
        parser.add_argument("--learning_rate", default=1e-4, type=float)
        parser.add_argument("--weight_decay", default=0.1, type=float)
        parser.add_argument("--warmup", default=0.01, type=float)
        return parent_args

    def __init__(self, args, num_data):
        super().__init__()
        self.args = args
        self.num_data = num_data
        print('num_data:', num_data)
        self.model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)

    def setup(self, stage) -> None:
        if stage == 'fit':
            # num_data is the number of training batches, so the step count is
            # epochs * batches / (gpus * gradient-accumulation steps).
            num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0
            self.total_step = int(self.trainer.max_epochs * self.num_data /
                                  (max(1, num_gpus) * self.trainer.accumulate_grad_batches))
            print('Total training step:', self.total_step)

    def training_step(self, batch, batch_idx):
        output = self.model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                            labels=batch['labels'])
        # output = self.model(input_ids=batch['input_ids'], labels=batch['labels'])
        # acc = self.compute_metrics(output.logits, batch['labels'])
        self.log('train_loss', output.loss)
        return output.loss

    def compute_metrics(self, logits, labels):
        # Token-level accuracy, averaged over all positions. (If enabled, ignored
        # positions, labelled -100 by convention, should also be masked out.)
        y_pred = th.argmax(logits, dim=-1).view(-1)
        y_true = labels.view(-1)
        return th.eq(y_pred, y_true).float().mean()
    def validation_step(self, batch, batch_idx):
        output = self.model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                            labels=batch['labels'])
        self.log('val_loss', output.loss)

    def configure_optimizers(self):
        # Apply weight decay to everything except biases and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        paras = list(filter(lambda p: p[1].requires_grad, self.named_parameters()))
        paras = [{
            'params': [p for n, p in paras if not any(nd in n for nd in no_decay)],
            'weight_decay': self.args.weight_decay
        }, {
            'params': [p for n, p in paras if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = th.optim.AdamW(paras, lr=self.args.learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, int(self.total_step * self.args.warmup), self.total_step)
        return [{
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1
            }
        }]
def main():
    total_parser = argparse.ArgumentParser("Summary Task")
    total_parser.add_argument('--do_eval_only', action='store_true', default=False)
    total_parser.add_argument('--pretrained_model_path', default=None, type=str)
    total_parser.add_argument('--output_save_path', default='./predict.json', type=str)
    # * Args for data preprocessing
    total_parser = GPT2DataModel.add_data_specific_args(total_parser)
    # * Args for training
    total_parser = Trainer.add_argparse_args(total_parser)
    total_parser = GPT2FinetuneMedicalQAModelCheckpoint.add_argparse_args(total_parser)
    # * Args for the base model
    total_parser = GPT2Finetune.add_model_specific_args(total_parser)
    args = total_parser.parse_args()

    data_model = GPT2DataModel(args)
    model = GPT2Finetune(args, len(data_model.train_dataloader()))
    checkpoint_callback = GPT2FinetuneMedicalQAModelCheckpoint(args).callbacks
    logger = loggers.TensorBoardLogger(save_dir=os.path.join(args.default_root_dir, 'log/'), name='MedicalQA-GPT2')
    trainer = Trainer.from_argparse_args(args, logger=logger, callbacks=[checkpoint_callback])
    trainer.fit(model, data_model)
    model.model.save_pretrained("./models/finetune/gpt2")


if __name__ == '__main__':
    main()
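Launching the script through finetune_gpt2.sh produces the following bash trace and training log: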
$ bash finetune_gpt2.sh
++ date
+ echo 'START TIME: Tue Aug 9 18:49:15 CST 2022'
START TIME: Tue Aug 9 18:49:15 CST 2022
+ MICRO_BATCH_SIZE=1
++ pwd
+ ROOT_DIR=/home/liuzhaofeng/nlg_pipeline/gpt2/dialog
+ ZERO_STAGE=3
+ config_json=/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/training_config.json
+ export MASTER_PORT=30021
+ MASTER_PORT=30021
+ cat
+ export PL_DEEPSPEED_CONFIG_PATH=/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/training_config.json
+ PL_DEEPSPEED_CONFIG_PATH=/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/training_config.json
+ TRAINER_ARGS='
--max_epochs 10 --gpus 8 --num_nodes 1 --strategy deepspeed_stage_3_offload --default_root_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog --dirpath /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/ckpt --save_top_k 3 --monitor train_loss --mode min --save_last '
+ DATA_DIR=/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/datasets
+ DATA_ARGS='
--data_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/datasets --train_batchsize 1 --valid_batchsize 1 --train_data train.txt --valid_data valid.txt --test_data test.txt
'
+ PRETRAINED_MODEL_PATH=IDEA-CCNL/Wenzhong2.0-GPT2-3.5B-chinese
+ MODEL_ARGS='
--pretrained_model_path IDEA-CCNL/Wenzhong2.0-GPT2-3.5B-chinese --output_save_path /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/predict.json --learning_rate 1e-4 --weight_decay 0.1 --warmup 0.01 '
+ SCRIPTS_PATH=/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py
+ export 'CMD= /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py
--max_epochs 10 --gpus 8 --num_nodes 1 --strategy deepspeed_stage_3_offload --default_root_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog --dirpath /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/ckpt --save_top_k 3 --monitor train_loss --mode min --save_last
--pretrained_model_path IDEA-CCNL/Wenzhong2.0-GPT2-3.5B-chinese --output_save_path /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/predict.json --learning_rate 1e-4 --weight_decay 0.1 --warmup 0.01
--data_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/datasets --train_batchsize 1 --valid_batchsize 1 --train_data train.txt --valid_data valid.txt --test_data test.txt
'
+ CMD=' /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py
--max_epochs 10 --gpus 8 --num_nodes 1 --strategy deepspeed_stage_3_offload --default_root_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog --dirpath /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/ckpt --save_top_k 3 --monitor train_loss --mode min --save_last
--pretrained_model_path IDEA-CCNL/Wenzhong2.0-GPT2-3.5B-chinese --output_save_path /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/predict.json --learning_rate 1e-4 --weight_decay 0.1 --warmup 0.01
--data_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/datasets --train_batchsize 1 --valid_batchsize 1 --train_data train.txt --valid_data valid.txt --test_data test.txt
'
+ python /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py --max_epochs 10 --gpus 8 --num_nodes 1 --strategy deepspeed_stage_3_offload --default_root_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog --dirpath /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/ckpt --save_top_k 3 --monitor train_loss --mode min --save_last --pretrained_model_path IDEA-CCNL/Wenzhong2.0-GPT2-3.5B-chinese --output_save_path /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/predict.json --learning_rate 1e-4 --weight_decay 0.1 --warmup 0.01 --data_dir /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/datasets --train_batchsize 1 --valid_batchsize 1 --train_data train.txt --valid_data valid.txt --test_data test.txt
Using pad_token, but it is not set yet.
Training-set preprocessing: 100%|██████████| 234801/234801 [00:00<00:00, 1141178.50it/s]
Using pad_token, but it is not set yet.
Validation-set preprocessing: 100%|██████████| 100/100 [00:00<00:00, 744991.83it/s]
Using pad_token, but it is not set yet.
Test-set preprocessing: 100%|██████████| 100/100 [00:00<00:00, 801970.17it/s]
num_data: 234801
Loading DeepSpeed config from set PL_DEEPSPEED_CONFIG_PATH environment variable
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[the same preprocessing output and "num_data: 234801" are printed again by each of the other 7 ranks; omitted]
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/8
initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/8
initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/8
initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/8
initializing deepspeed distributed: GLOBAL_RANK: 4, MEMBER: 5/8
initializing deepspeed distributed: GLOBAL_RANK: 5, MEMBER: 6/8
initializing deepspeed distributed: GLOBAL_RANK: 6, MEMBER: 7/8
initializing deepspeed distributed: GLOBAL_RANK: 7, MEMBER: 8/8
Total training step: 293501
[printed once per rank, 8 times in total]
/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:611: UserWarning: Checkpoint directory /home/liuzhaofeng/nlg_pipeline/gpt2/dialog/ckpt exists and is not empty.
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:2192: LightningDeprecationWarning: `Trainer.gpus` was deprecated in v1.6 and will be removed in v1.8. Please use `Trainer.num_devices` or `Trainer.device_ids` to get device information instead.
  rank_zero_deprecation(
LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]
You have specified an optimizer and/or scheduler within the DeepSpeed config. It is recommended to define it in `LightningModule.configure_optimizers`.
Using /home/liuzhaofeng/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
[printed once per rank; the repeats are omitted below]
Detected CUDA files, patching ldflags
Emitting ninja build file /home/liuzhaofeng/.cache/torch_extensions/py39_cu113/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module cpu_adam...
Time to load cpu_adam op: 3.354057550430298 seconds
[cpu_adam loaded on the remaining 7 ranks in 3.20-3.38 seconds each]
Emitting ninja build file /home/liuzhaofeng/.cache/torch_extensions/py39_cu113/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.5016887187957764 seconds
[utils loaded on the remaining ranks in 0.001-0.21 seconds each]
| Name | Type | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 364
------------------------------------------
364 Trainable params
0 Non-trainable params
364 Total params
0.001 Total estimated model params size (MB)
Sanity Checking: 0it [00:00, ?it/s]/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
Epoch 0: 0%| | 0/29364 [00:00<?, ?it/s]/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py:420: UserWarning: Error handling mechanism for deadlock detection is uninitialized. Skipping check.
rank_zero_warn("Error handling mechanism for deadlock detection is uninitialized. Skipping check.")
Traceback (most recent call last):
  File "/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py", line 147, in <module>
    main()
  File "/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py", line 141, in main
    trainer.fit(model, data_model)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 768, in fit
    self._call_and_handle_interrupt(
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 721, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 809, in _fit_impl
    results = self._run(model, ckpt_path=self.ckpt_path)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1234, in _run
    results = self._run_stage()
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1321, in _run_stage
    return self._run_train()
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1351, in _run_train
    self.fit_loop.run()
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
    self.advance(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 268, in advance
    self._outputs = self.epoch_loop.run(self._data_fetcher)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
    self.advance(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 208, in advance
    batch_output = self.batch_loop.run(batch, batch_idx)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
    self.advance(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
    outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 204, in run
    self.advance(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 203, in advance
    result = self._run_optimization(
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 256, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 369, in _optimizer_step
    self.trainer._call_lightning_module_hook(
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1593, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1644, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 168, in step
    step_output = self._strategy.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 278, in optimizer_step
    optimizer_output = super().optimizer_step(optimizer, opt_idx, closure, model, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 193, in optimizer_step
    return self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/deepspeed.py", line 70, in optimizer_step
    closure_result = closure()
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 148, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 134, in closure
    step_output = self._step_fn()
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 427, in _training_step
    training_step_output = self.trainer._call_strategy_hook("training_step", *step_kwargs.values())
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1763, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 341, in training_step
    return self.model(*args, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1588, in forward
    loss = self.module(*inputs, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/strategies/deepspeed.py", line 80, in forward
    return super().forward(*inputs, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/pytorch_lightning/overrides/base.py", line 82, in forward
    output = self.module.training_step(*inputs, **kwargs)
  File "/home/liuzhaofeng/nlg_pipeline/gpt2/dialog/finetune_gpt2.py", line 76, in training_step
    output = self.model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 1058, in forward
    transformer_outputs = self.transformer(
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 901, in forward
    outputs = block(
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 438, in forward
    feed_forward_hidden_states = self.mlp(hidden_states)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1148, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 365, in forward
    hidden_states = self.c_fc(hidden_states)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1137, in _call_impl
    result = hook(self, input)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/stage3.py", line 1408, in _pre_forward_module_hook
    self.pre_sub_module_forward_function(module)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/stage3.py", line 1520, in pre_sub_module_forward_function
    self.param_coordinator.fetch_sub_module(sub_module)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/stage3.py", line 448, in fetch_sub_module
    self._all_gather(partitioned_params, async_op=False)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/stage3.py", line 525, in _all_gather
    handles = partitioned_params[0].all_gather(
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 596, in all_gather
    return self._all_gather(param_list, async_op=async_op, hierarchy=hierarchy)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 705, in _all_gather
    ret_value = self._allgather_params_coalesced(all_gather_list, hierarchy)
  File "/datafile/liuzhaofeng/anaconda3/lib/python3.9/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 936, in _allgather_params_coalesced
    flat_tensor = torch.empty(tensor_size,
[a second rank's identical traceback, interleaved here in the original output, is omitted]
RuntimeError: CUDA out of memory. Tried to allocate 72.00 MiB (GPU 3; 10.92 GiB total capacity; 10.02 GiB already allocated; 32.69 MiB free; 10.16 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Since a 1080Ti has 11 GB of memory, while by our calculation the parameters of the 3.5B model alone take about 13 GB, we hoped ZeRO-3 would partition the parameters across the 8 GPUs. But after launching the script we monitored GPU memory usage, and the model does not appear to have been partitioned at all.
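To make the arithmetic above concrete, here is a minimal back-of-the-envelope sketch. Everything is estimated from the nominal 3.5B parameter count, not measured from the run:

    # Rough parameter-memory budget for a 3.5B-parameter GPT-2 under ZeRO-3.
    # All figures are estimates, not measurements from the failed job.
    n_params = 3.5e9
    gib = 1024 ** 3

    fp32_bytes = n_params * 4   # full-precision master copy
    fp16_bytes = n_params * 2   # half-precision copy used in forward/backward

    print(f'fp32 parameters: {fp32_bytes / gib:.1f} GiB')    # ~13.0 GiB
    print(f'fp16 parameters: {fp16_bytes / gib:.1f} GiB')    # ~6.5 GiB
    print(f'fp16 share per GPU, ZeRO-3 over 8 GPUs: '
          f'{fp16_bytes / 8 / gib:.2f} GiB')                  # ~0.82 GiB

If partitioning were in effect, each 11 GB card would hold well under 1 GiB of fp16 parameters, plus activations and transient all-gather buffers (the 72 MiB allocation that fails in `_allgather_params_coalesced` is such a buffer, for the `c_fc` layer). The 10.02 GiB already allocated on GPU 3 at the very first training step therefore supports the suspicion that each rank is holding far more than its 1/8 share. One plausible cause, which we have not verified: the model is built with `from_pretrained` inside `__init__`, so every process materializes a full copy before DeepSpeed ever sees it. PyTorch Lightning's DeepSpeed stage-3 documentation recommends constructing large models inside the `configure_sharded_model` hook instead, so the weights are created within the sharding context. A sketch of that change is below; note that whether `from_pretrained` cooperates with partitioned loading also depends on the transformers/DeepSpeed integration, so treat this as a direction to test, not a confirmed fix:

    class GPT2Finetune(pl.LightningModule):
        def __init__(self, args, num_data):
            super().__init__()
            self.args = args
            self.num_data = num_data
            self.model = None  # defer construction until the sharding context exists

        def configure_sharded_model(self):
            # Lightning calls this hook inside DeepSpeed's zero.Init context,
            # so parameters can be partitioned as the module is created rather
            # than fully materialized once per rank.
            if self.model is None:
                self.model = GPT2LMHeadModel.from_pretrained(self.args.pretrained_model_path)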