Continuing from the previous post, we dig deeper into the core implementation of YOLO v1, focusing on the loss function design and the training pipeline.
## YOLO v1 Loss Function

The loss function is the heart of YOLO v1: it jointly optimizes localization, objectness confidence, and classification in a single regression objective. A PyTorch implementation:
```python
import torch
import torch.nn as nn


class YOLOLoss(nn.Module):
    def __init__(self, S=7, B=2, C=20, lambda_coord=5, lambda_noobj=0.5):
        super(YOLOLoss, self).__init__()
        self.S = S                        # grid size
        self.B = B                        # bounding boxes predicted per cell
        self.C = C                        # number of classes
        self.lambda_coord = lambda_coord  # weight for the coordinate loss
        self.lambda_noobj = lambda_noobj  # weight for confidence loss in empty cells
        self.mse = nn.MSELoss(reduction='sum')

    def forward(self, predictions, targets):
        """Compute the YOLO v1 loss.

        Args:
            predictions: model output [batch_size, S, S, B*5+C]
            targets: ground truth [batch_size, S, S, 5+C],
                laid out as [obj_flag, x, y, w, h, class_probs]
        Returns:
            scalar loss
        """
        batch_size = predictions.size(0)

        # Split predictions and targets into their components
        pred_box1 = predictions[..., :5]      # first predicted box:  [batch, S, S, 5]
        pred_box2 = predictions[..., 5:10]    # second predicted box: [batch, S, S, 5]
        pred_classes = predictions[..., 10:]  # class probabilities:  [batch, S, S, C]

        target_box = targets[..., 1:5]        # ground-truth box:     [batch, S, S, 4]
        target_classes = targets[..., 5:]     # ground-truth classes: [batch, S, S, C]

        # Object / no-object masks: [batch, S, S, 1]
        obj_mask = targets[..., 0:1]
        noobj_mask = 1 - obj_mask

        # IoU of each predicted box with the ground truth: [batch, S, S, 1]
        iou_box1 = calculate_iou(pred_box1[..., :4], target_box)
        iou_box2 = calculate_iou(pred_box2[..., :4], target_box)

        # Pick the box with the higher IoU; best_box is 0 for box1, 1 for box2
        iou_maxes, best_box = torch.max(
            torch.stack([iou_box1, iou_box2], dim=0), dim=0)

        # Mask that is 1 where box1 is responsible, 0 where box2 is: [batch, S, S, 1]
        box1_mask = (best_box == 0).float()

        # Select the responsible box per cell
        responsible_box = box1_mask * pred_box1 + (1 - box1_mask) * pred_box2

        # 1. Coordinate loss (only for cells that contain an object)
        # 1.1 Center (x, y) loss
        center_loss = self.mse(
            responsible_box[..., :2] * obj_mask,
            target_box[..., :2] * obj_mask,
        )
        # 1.2 Width/height loss: square roots shrink the gap between
        #     errors on large boxes and errors on small boxes
        width_height_loss = self.mse(
            torch.sign(responsible_box[..., 2:4])
            * torch.sqrt(torch.abs(responsible_box[..., 2:4]) + 1e-6) * obj_mask,
            torch.sqrt(target_box[..., 2:4] + 1e-6) * obj_mask,
        )

        # 2. Confidence loss
        # 2.1 Cells with an object: the confidence target is the IoU with the
        #     ground truth; detach so no gradient flows through the target
        confidence_target = (obj_mask * iou_maxes).detach()
        obj_confidence_loss = self.mse(
            responsible_box[..., 4:5] * obj_mask,
            confidence_target,
        )
        # 2.2 Cells without an object: push both boxes' confidence toward zero
        noobj_confidence_loss = self.mse(
            pred_box1[..., 4:5] * noobj_mask,
            torch.zeros_like(pred_box1[..., 4:5]),
        ) + self.mse(
            pred_box2[..., 4:5] * noobj_mask,
            torch.zeros_like(pred_box2[..., 4:5]),
        )

        # 3. Classification loss
        class_loss = self.mse(
            pred_classes * obj_mask,
            target_classes * obj_mask,
        )

        # Total loss = coordinate loss + confidence loss + class loss
        total_loss = (
            self.lambda_coord * (center_loss + width_height_loss)
            + obj_confidence_loss
            + self.lambda_noobj * noobj_confidence_loss
            + class_loss
        )
        return total_loss / batch_size
```
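As a quick sanity check, the loss can be exercised with random tensors of the expected shapes (a throwaway sketch; it assumes the `calculate_iou` helper defined in the next section is already in scope):

```python
# Sanity check: random predictions, one synthetic object at grid cell (3, 3)
criterion = YOLOLoss(S=7, B=2, C=20)
predictions = torch.rand(2, 7, 7, 2 * 5 + 20)  # [batch, S, S, B*5+C]
targets = torch.zeros(2, 7, 7, 5 + 20)         # [batch, S, S, 5+C]
targets[0, 3, 3, :5] = torch.tensor([1.0, 0.5, 0.5, 0.2, 0.3])  # obj, x, y, w, h
targets[0, 3, 3, 5] = 1.0                      # one-hot class 0
loss = criterion(predictions, targets)
print(loss.item())                             # a finite, positive scalar
```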
## IoU Calculation

IoU (Intersection over Union) measures how much two bounding boxes overlap. The loss function needs the IoU between each predicted box and the ground-truth box:
```python
def calculate_iou(pred_boxes, target_boxes):
    """Compute the IoU between predicted and target boxes.

    Args:
        pred_boxes: predicted boxes [batch_size, S, S, 4] as (x, y, w, h)
        target_boxes: target boxes [batch_size, S, S, 4] as (x, y, w, h)
    Returns:
        iou: [batch_size, S, S, 1]
    """
    # Convert (x, y, w, h) to (x1, y1, x2, y2)
    pred_x1 = pred_boxes[..., 0:1] - pred_boxes[..., 2:3] / 2
    pred_y1 = pred_boxes[..., 1:2] - pred_boxes[..., 3:4] / 2
    pred_x2 = pred_boxes[..., 0:1] + pred_boxes[..., 2:3] / 2
    pred_y2 = pred_boxes[..., 1:2] + pred_boxes[..., 3:4] / 2

    target_x1 = target_boxes[..., 0:1] - target_boxes[..., 2:3] / 2
    target_y1 = target_boxes[..., 1:2] - target_boxes[..., 3:4] / 2
    target_x2 = target_boxes[..., 0:1] + target_boxes[..., 2:3] / 2
    target_y2 = target_boxes[..., 1:2] + target_boxes[..., 3:4] / 2

    # Intersection rectangle
    x1 = torch.max(pred_x1, target_x1)
    y1 = torch.max(pred_y1, target_y1)
    x2 = torch.min(pred_x2, target_x2)
    y2 = torch.min(pred_y2, target_y2)

    # Intersection area, clamped so width/height are never negative
    intersection = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0)

    # Areas of the individual boxes
    pred_area = (pred_x2 - pred_x1) * (pred_y2 - pred_y1)
    target_area = (target_x2 - target_x1) * (target_y2 - target_y1)

    # Union area, with a small epsilon to avoid division by zero
    union = pred_area + target_area - intersection + 1e-6

    return intersection / union
```
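Two quick checks confirm the expected behavior: a box compared with itself should give an IoU of (almost) 1, and two disjoint boxes should give 0:

```python
a = torch.tensor([0.5, 0.5, 0.2, 0.2]).view(1, 1, 1, 4)  # (x, y, w, h)
b = torch.tensor([0.9, 0.9, 0.1, 0.1]).view(1, 1, 1, 4)  # far away from a

print(calculate_iou(a, a).item())  # ~1.0 (identical boxes)
print(calculate_iou(a, b).item())  # 0.0  (no overlap)
```

(The self-IoU is slightly below 1 because of the 1e-6 added to the union.)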
## Data Preprocessing and Label Encoding

To train YOLO, the raw annotations must be converted into YOLO-format targets:
```python
def convert_to_yolo_format(boxes, labels, image_size, S=7, C=20):
    """Convert bounding boxes and labels into the YOLO target format.

    Args:
        boxes: original boxes [N, 4] as (x1, y1, x2, y2) in absolute pixels
        labels: class labels [N]
        image_size: image size as (height, width)
        S: grid size
        C: number of classes
    Returns:
        target: YOLO-format label tensor [S, S, 5+C]
    """
    target = torch.zeros((S, S, 5 + C))

    # Convert absolute pixel coordinates to coordinates relative to the image
    boxes_rel = boxes.clone().float()
    boxes_rel[:, [0, 2]] /= image_size[1]  # divide x coordinates by the width
    boxes_rel[:, [1, 3]] /= image_size[0]  # divide y coordinates by the height

    # Convert (x1, y1, x2, y2) to (x, y, w, h)
    boxes_xywh = torch.zeros_like(boxes_rel)
    boxes_xywh[:, 0] = (boxes_rel[:, 0] + boxes_rel[:, 2]) / 2  # center x
    boxes_xywh[:, 1] = (boxes_rel[:, 1] + boxes_rel[:, 3]) / 2  # center y
    boxes_xywh[:, 2] = boxes_rel[:, 2] - boxes_rel[:, 0]        # width
    boxes_xywh[:, 3] = boxes_rel[:, 3] - boxes_rel[:, 1]        # height

    for i in range(len(boxes)):
        # Grid cell containing the box center
        grid_x = int(S * boxes_xywh[i, 0])
        grid_y = int(S * boxes_xywh[i, 1])
        # Keep the indices inside the grid
        grid_x = min(grid_x, S - 1)
        grid_y = min(grid_y, S - 1)

        # Center offset relative to the cell's top-left corner
        x_offset = boxes_xywh[i, 0] * S - grid_x
        y_offset = boxes_xywh[i, 1] * S - grid_y

        # Objectness flag (note: a later box whose center falls in the same
        # cell overwrites an earlier one -- a known YOLO v1 limitation)
        target[grid_y, grid_x, 0] = 1
        # Box coordinates relative to the cell
        target[grid_y, grid_x, 1] = x_offset
        target[grid_y, grid_x, 2] = y_offset
        target[grid_y, grid_x, 3] = boxes_xywh[i, 2]  # width
        target[grid_y, grid_x, 4] = boxes_xywh[i, 3]  # height
        # One-hot class probabilities
        target[grid_y, grid_x, 5 + labels[i]] = 1

    return target
```
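For instance, a box spanning (200, 200) to (300, 300) in a 500x500 image has its center at relative coordinates (0.5, 0.5), which lands in grid cell (3, 3) with offsets of exactly 0.5 (box, label, and image size here are made up for illustration):

```python
boxes = torch.tensor([[200.0, 200.0, 300.0, 300.0]])  # one box, absolute pixels
labels = torch.tensor([7])                            # e.g. class index 7
target = convert_to_yolo_format(boxes, labels, image_size=(500, 500))

print(target.shape)         # torch.Size([7, 7, 25])
print(target[3, 3, :5])     # tensor([1.0000, 0.5000, 0.5000, 0.2000, 0.2000])
print(target[3, 3, 5 + 7])  # tensor(1.) -- the one-hot class entry
```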
## Data Augmentation

To improve generalization, YOLO v1 applies several data augmentation techniques:
```python
import random

import cv2
import numpy as np
import torch
import torchvision.transforms.functional as TF
from PIL import Image


def data_augmentation(image, boxes):
    """Apply data augmentation to an image and its bounding boxes.

    Args:
        image: input image [H, W, 3], uint8 RGB numpy array
        boxes: bounding boxes [N, 4] as (x1, y1, x2, y2) in absolute pixels
    Returns:
        augmented_image: augmented image, resized to 448x448
        augmented_boxes: boxes in the 448x448 frame of the returned image
    """
    augmented_boxes = boxes.clone().float()

    # 1. Randomly jitter brightness, contrast, saturation, and hue
    #    (done on a PIL image, which is what the torchvision functional API expects)
    pil_image = Image.fromarray(image)
    if random.random() < 0.5:
        pil_image = TF.adjust_brightness(pil_image, random.uniform(0.8, 1.2))
    if random.random() < 0.5:
        pil_image = TF.adjust_contrast(pil_image, random.uniform(0.8, 1.2))
    if random.random() < 0.5:
        pil_image = TF.adjust_saturation(pil_image, random.uniform(0.8, 1.2))
    if random.random() < 0.5:
        pil_image = TF.adjust_hue(pil_image, random.uniform(-0.1, 0.1))
    augmented_image = np.array(pil_image)

    # 2. Random horizontal flip
    if random.random() < 0.5:
        augmented_image = augmented_image[:, ::-1].copy()
        width = augmented_image.shape[1]
        augmented_boxes[:, 0] = width - augmented_boxes[:, 0] - 1  # mirror x1
        augmented_boxes[:, 2] = width - augmented_boxes[:, 2] - 1  # mirror x2
        # Swap x1 and x2 so that x1 < x2 still holds
        augmented_boxes[:, [0, 2]] = augmented_boxes[:, [2, 0]]

    # 3. Random crop
    if random.random() < 0.5:
        height, width = augmented_image.shape[:2]
        scale = random.uniform(0.8, 1.0)
        new_height = int(height * scale)
        new_width = int(width * scale)
        # Pick the crop window at random
        x_offset = random.randint(0, width - new_width)
        y_offset = random.randint(0, height - new_height)
        augmented_image = augmented_image[y_offset:y_offset + new_height,
                                          x_offset:x_offset + new_width]
        # Shift boxes into the crop's coordinate frame and clamp to its bounds
        # (boxes fully outside the crop collapse to zero width/height; a fuller
        # pipeline would filter them out)
        augmented_boxes[:, [0, 2]] = torch.clamp(
            augmented_boxes[:, [0, 2]] - x_offset, min=0, max=new_width - 1)
        augmented_boxes[:, [1, 3]] = torch.clamp(
            augmented_boxes[:, [1, 3]] - y_offset, min=0, max=new_height - 1)

    # 4. Resize to the network input size and rescale the boxes with it
    height, width = augmented_image.shape[:2]
    augmented_image = cv2.resize(augmented_image, (448, 448))
    augmented_boxes[:, [0, 2]] *= 448.0 / width
    augmented_boxes[:, [1, 3]] *= 448.0 / height

    return augmented_image, augmented_boxes
```
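Typical usage, assuming images are loaded with cv2 (which returns BGR, so convert to RGB first; the file path is a placeholder). Because the function returns boxes in the 448x448 frame, pass `image_size=(448, 448)` when encoding them with `convert_to_yolo_format`:

```python
image = cv2.imread("path/to/image.jpg")             # BGR, uint8
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)      # the function assumes RGB
boxes = torch.tensor([[48.0, 60.0, 320.0, 400.0]])  # (x1, y1, x2, y2) in pixels

aug_image, aug_boxes = data_augmentation(image, boxes)
print(aug_image.shape)  # (448, 448, 3)
target = convert_to_yolo_format(aug_boxes, torch.tensor([0]),
                                image_size=(448, 448))
```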
## Training Pipeline

Below is the full YOLO v1 training loop:
```python
def train_yolo(model, train_loader, val_loader, num_epochs=135, lr=0.0001):
    """Train the YOLO model.

    Args:
        model: the YOLO model
        train_loader: training data loader
        val_loader: validation data loader
        num_epochs: number of training epochs
        lr: initial learning rate
    """
    # Loss function and optimizer
    criterion = YOLOLoss(S=7, B=2, C=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    # Learning-rate schedule: decay by 10x at epochs 75 and 105
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[75, 105], gamma=0.1)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0

        for batch_idx, (images, targets) in enumerate(train_loader):
            images = images.to(device)    # [batch_size, 3, 448, 448]
            targets = targets.to(device)  # [batch_size, S, S, 5+C]

            # Forward pass
            predictions = model(images)
            loss = criterion(predictions, targets)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            if batch_idx % 50 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], "
                      f"Batch [{batch_idx}/{len(train_loader)}], "
                      f"Loss: {loss.item():.4f}")

        scheduler.step()

        avg_loss = epoch_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")

        # Validate every 5 epochs
        if (epoch + 1) % 5 == 0:
            validate_yolo(model, val_loader, criterion)

        # Checkpoint every 10 epochs
        if (epoch + 1) % 10 == 0:
            torch.save(model.state_dict(), f"yolo_epoch_{epoch+1}.pth")

    torch.save(model.state_dict(), "yolo_final.pth")
```
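The loop above uses Adam for simplicity. The original paper instead trains with SGD (momentum 0.9, weight decay 5e-4) and a hand-tuned schedule: a warmup from 1e-3 to 1e-2, then 1e-2 for 75 epochs, 1e-3 for 30, and 1e-4 for the final 30. A sketch of that setup follows (the linear warmup over 5 epochs is my approximation; the paper does not pin down the exact ramp):

```python
# Paper-style optimizer; swap this in for Adam in train_yolo if desired
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3,
                            momentum=0.9, weight_decay=5e-4)

def adjust_lr(optimizer, epoch):
    """Approximate the YOLO v1 paper's learning-rate schedule."""
    if epoch < 5:
        lr = 1e-3 + (1e-2 - 1e-3) * epoch / 5  # assumed linear warmup
    elif epoch < 80:
        lr = 1e-2  # roughly 75 epochs at 1e-2
    elif epoch < 110:
        lr = 1e-3  # 30 epochs at 1e-3
    else:
        lr = 1e-4  # remaining epochs at 1e-4
    for group in optimizer.param_groups:
        group['lr'] = lr
```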
## Validation

During training, the model is validated periodically:
```python
def validate_yolo(model, val_loader, criterion):
    """Evaluate the YOLO model on the validation set.

    Args:
        model: the YOLO model
        val_loader: validation data loader
        criterion: loss function
    """
    model.eval()
    val_loss = 0
    all_detections = []
    all_ground_truths = []

    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device)
            targets = targets.to(device)

            predictions = model(images)
            loss = criterion(predictions, targets)
            val_loss += loss.item()

            # Decode the predictions image by image
            batch_size = images.size(0)
            for i in range(batch_size):
                pred = predictions[i]
                boxes, class_ids, scores = decode_predictions(
                    pred.unsqueeze(0),  # add a batch dimension
                    S=7, B=2, C=20, confidence_threshold=0.1)

                # Non-maximum suppression
                keep_indices = non_maximum_suppression(
                    boxes, scores, iou_threshold=0.5)
                if len(keep_indices) > 0:
                    boxes = boxes[keep_indices]
                    class_ids = class_ids[keep_indices]
                    scores = scores[keep_indices]
                    detections = []
                    for j in range(len(boxes)):
                        detections.append({
                            'bbox': boxes[j].cpu().numpy(),
                            'class_id': class_ids[j].item(),
                            'confidence': scores[j].item(),
                        })
                    all_detections.append(detections)
                else:
                    all_detections.append([])

                # Decode the ground-truth labels
                target = targets[i]
                ground_truths = []
                for cy in range(7):
                    for cx in range(7):
                        if target[cy, cx, 0] > 0:  # cell contains an object
                            # Recover center and size (relative to the image)
                            x = (target[cy, cx, 1].item() + cx) / 7
                            y = (target[cy, cx, 2].item() + cy) / 7
                            w = target[cy, cx, 3].item()
                            h = target[cy, cx, 4].item()
                            # Convert to (x1, y1, x2, y2)
                            x1 = max(0, x - w / 2)
                            y1 = max(0, y - h / 2)
                            x2 = min(1, x + w / 2)
                            y2 = min(1, y + h / 2)
                            # Class from the one-hot vector
                            class_id = torch.argmax(target[cy, cx, 5:]).item()
                            ground_truths.append({
                                'bbox': [x1, y1, x2, y2],
                                'class_id': class_id,
                            })
                all_ground_truths.append(ground_truths)

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    mAP = calculate_mAP(all_detections, all_ground_truths)
    print(f"mAP: {mAP:.4f}")

    return avg_val_loss, mAP
```
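`decode_predictions` and `non_maximum_suppression` were defined in the previous post. For readers landing on this part directly, here is a minimal greedy NMS sketch compatible with the call above (a simplified stand-in, not necessarily identical to the earlier version; in practice `torchvision.ops.nms` does the same job in optimized C++):

```python
def non_maximum_suppression(boxes, scores, iou_threshold=0.5):
    """Greedy NMS sketch. boxes: [N, 4] as (x1, y1, x2, y2); scores: [N].
    Returns indices of the boxes to keep."""
    order = torch.argsort(scores, descending=True)
    keep = []
    while order.numel() > 0:
        best = order[0]
        keep.append(best.item())
        if order.numel() == 1:
            break
        rest = order[1:]
        # IoU of the best box against every remaining box
        x1 = torch.max(boxes[best, 0], boxes[rest, 0])
        y1 = torch.max(boxes[best, 1], boxes[rest, 1])
        x2 = torch.min(boxes[best, 2], boxes[rest, 2])
        y2 = torch.min(boxes[best, 3], boxes[rest, 3])
        inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
        area_best = (boxes[best, 2] - boxes[best, 0]) * (boxes[best, 3] - boxes[best, 1])
        area_rest = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
        iou = inter / (area_best + area_rest - inter + 1e-6)
        # Drop everything that overlaps the kept box too much
        order = rest[iou < iou_threshold]
    return torch.tensor(keep, dtype=torch.long)
```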
## Computing mAP

To evaluate the model, we compute mAP (mean Average Precision):
```python
import numpy as np


def calculate_mAP(all_detections, all_ground_truths, iou_threshold=0.5, num_classes=20):
    """Compute mAP (mean Average Precision).

    Args:
        all_detections: detections for every image
        all_ground_truths: ground-truth boxes for every image
        iou_threshold: IoU threshold for a match
        num_classes: number of classes
    Returns:
        mAP: mean average precision
    """
    # Collect predictions and ground truths per class
    class_predictions = [[] for _ in range(num_classes)]
    class_ground_truths = [[] for _ in range(num_classes)]

    for img_idx in range(len(all_detections)):
        for detection in all_detections[img_idx]:
            class_predictions[detection['class_id']].append({
                'confidence': detection['confidence'],
                'bbox': detection['bbox'],
                'img_idx': img_idx,
            })
        for gt in all_ground_truths[img_idx]:
            class_ground_truths[gt['class_id']].append({
                'bbox': gt['bbox'],
                'img_idx': img_idx,
                'matched': False,  # has this ground truth been claimed yet?
            })

    # Average precision per class
    APs = []
    for class_id in range(num_classes):
        predictions = class_predictions[class_id]
        ground_truths = class_ground_truths[class_id]

        # Skip classes with no ground-truth instances
        if len(ground_truths) == 0:
            continue

        # Sort predictions by confidence, highest first
        predictions.sort(key=lambda x: x['confidence'], reverse=True)

        precisions = []
        recalls = []
        tp = 0  # true positives
        fp = 0  # false positives
        total_gt = len(ground_truths)

        for pred in predictions:
            img_idx = pred['img_idx']
            pred_bbox = pred['bbox']

            # Look for an unmatched ground truth in the same image
            # (greedy: the first one with sufficient IoU is taken;
            # the VOC protocol matches the highest-IoU one)
            matched_gt = False
            for gt in ground_truths:
                if gt['img_idx'] == img_idx and not gt['matched']:
                    gt_bbox = gt['bbox']
                    # IoU between prediction and ground truth
                    x1 = max(pred_bbox[0], gt_bbox[0])
                    y1 = max(pred_bbox[1], gt_bbox[1])
                    x2 = min(pred_bbox[2], gt_bbox[2])
                    y2 = min(pred_bbox[3], gt_bbox[3])
                    intersection = max(0, x2 - x1) * max(0, y2 - y1)
                    pred_area = (pred_bbox[2] - pred_bbox[0]) * (pred_bbox[3] - pred_bbox[1])
                    gt_area = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1])
                    union = pred_area + gt_area - intersection + 1e-6
                    iou = intersection / union
                    if iou >= iou_threshold:
                        matched_gt = True
                        gt['matched'] = True
                        break

            if matched_gt:
                tp += 1
            else:
                fp += 1

            # Running precision and recall after this prediction
            precisions.append(tp / (tp + fp))
            recalls.append(tp / total_gt)

        # AP via 11-point interpolation
        AP = 0
        for t in np.arange(0, 1.1, 0.1):
            if np.sum(np.array(recalls) >= t) == 0:
                p = 0
            else:
                p = np.max(np.array(precisions)[np.array(recalls) >= t])
            AP += p / 11
        APs.append(AP)

    return np.mean(APs)
```
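A quick worked example of the 11-point interpolation: suppose a class has 2 ground truths and three predictions whose match outcomes, sorted by confidence, are [TP, FP, TP]. The running precision is then [1.0, 0.5, 0.67] and the running recall [0.5, 0.5, 1.0]. At the six recall levels t ≤ 0.5 the interpolated precision is 1.0, and at the five levels above 0.5 it is 0.67, so AP ≈ (6·1.0 + 5·0.67)/11 ≈ 0.85. The same computation with the loop from the function:

```python
import numpy as np

# Toy example: 2 ground truths; predictions sorted by confidence -> [TP, FP, TP]
precisions = [1.0, 0.5, 2 / 3]  # running precision after each prediction
recalls = [0.5, 0.5, 1.0]       # running recall after each prediction

AP = 0
for t in np.arange(0, 1.1, 0.1):
    mask = np.array(recalls) >= t
    p = np.max(np.array(precisions)[mask]) if mask.any() else 0
    AP += p / 11
print(f"AP = {AP:.4f}")  # ~0.8485
```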
## Complete Training Example

Here is a complete example that wires the components above together for training:
```python
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the model (YOLOv1 and VOCDataset were defined in the previous post)
    model = YOLOv1(S=7, B=2, C=20)
    model = model.to(device)

    # Image transforms
    transform = transforms.Compose([
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # Pascal VOC is used as the example dataset
    train_dataset = VOCDataset(
        root="path/to/VOC2007",
        year="2007",
        image_set="train",
        transform=transform,
        S=7, B=2, C=20)

    val_dataset = VOCDataset(
        root="path/to/VOC2007",
        year="2007",
        image_set="val",
        transform=transform,
        S=7, B=2, C=20)

    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True)

    val_loader = DataLoader(
        val_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False)

    # Train
    train_yolo(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        num_epochs=135,
        lr=0.0001)
```
## Summary

This post has walked through YOLO v1's loss function design and training pipeline in detail, from the multi-part MSE loss and IoU computation to label encoding, data augmentation, validation, and mAP evaluation.