Worked example
GPU memory optimization
## BS = batch size, GA = gradient accumulation steps
## BS=1 with GA=32 gives the same effective batch size as BS=32
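To make the equivalence concrete, here is a minimal sketch of manual gradient accumulation in plain PyTorch; `model`, `loss_fn`, `optimizer`, and `loader` are placeholders for objects defined elsewhere. The `Trainer` below does the same thing internally when `gradient_accumulation_steps` is set.

```python
accumulation_steps = 32  # GA=32, micro-batch size 1

optimizer.zero_grad()
for step, (inputs, labels) in enumerate(loader):   # loader yields BS=1 micro-batches
    loss = loss_fn(model(inputs), labels)
    # Scale the loss so the accumulated gradient matches the average over one BS=32 batch.
    (loss / accumulation_steps).backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()        # one parameter update per 32 micro-batches
        optimizer.zero_grad()
```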
```python
from transformers import TrainingArguments  # tokenizer and model are assumed to be loaded earlier

def process_function(examples):
    tokenized_examples = tokenizer(examples["review"], max_length=32, truncation=True, padding="max_length")
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples

train_args = TrainingArguments(
    output_dir="./checkpoints",        # output directory
    per_device_train_batch_size=2,     # per-device training batch size
    gradient_accumulation_steps=32,    # *** gradient accumulation ***
    gradient_checkpointing=True,       # *** gradient checkpointing *** keep only some activations; recompute the rest during the backward pass
    optim="adafactor",                 # *** Adafactor optimizer ***
    per_device_eval_batch_size=4,      # per-device evaluation batch size
    num_train_epochs=1,                # number of training epochs
    logging_steps=10,                  # logging frequency
    eval_strategy="epoch",             # evaluation strategy
    save_strategy="epoch",             # checkpoint-saving strategy
    save_total_limit=3,                # maximum number of checkpoints to keep
    learning_rate=2e-5,                # learning rate
    weight_decay=0.001,                # weight decay
    metric_for_best_model="f1",        # metric used to pick the best model
    load_best_model_at_end=True)       # load the best checkpoint when training ends
train_args
```

```python
# *** Parameter freezing ***
for name, param in model.bert.named_parameters():
    param.requires_grad = False
```
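A hedged sketch of how these pieces could be wired together with `Trainer`. The checkpoint name `bert-base-chinese`, the `DatasetDict` named `dataset` (with train/validation splits and `review`/`label` columns), and the `compute_metrics` helper are assumptions for illustration, not part of the original notebook.

```python
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

# Assumed checkpoint; any sequence-classification model with a .bert backbone works.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=2)

# Freeze the BERT backbone as above; only the classification head stays trainable.
for name, param in model.bert.named_parameters():
    param.requires_grad = False

# `dataset` is an assumed DatasetDict with "review"/"label" columns, e.g. from datasets.load_dataset.
tokenized_ds = dataset.map(process_function, batched=True,
                           remove_columns=dataset["train"].column_names)

# Provide an F1 metric so that metric_for_best_model="f1" has something to read.
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    return f1_metric.compute(predictions=logits.argmax(axis=-1), references=labels)

trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_ds["train"],
                  eval_dataset=tokenized_ds["validation"],
                  compute_metrics=compute_metrics)
trainer.train()
```

Freezing the backbone saves memory because frozen parameters need no gradients or optimizer state; together with gradient accumulation, gradient checkpointing, and Adafactor, the effective batch size of 64 (2 × 32) can fit on a small GPU.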