Faster R-CNN 的训练可分为交替训练和多任务训练两种方式，后者的损失计算过程可参看 train.py，其中包含 RPN 与 R-CNN 两部分损失：

RPN

    # RPN 2-way classification loss
    # Flatten the RPN scores to rows of 2 logits and the RPN labels to a 1-D vector.
    rpn_cls_score = tf.reshape(self.net.get_output('rpn_cls_score_reshape'),[-1,2])
    rpn_label = tf.reshape(self.net.get_output('rpn-data')[0],[-1])
    # Drop every entry whose label equals -1 so it does not contribute to the loss.
    # Order matters: the scores are filtered first, using the still-unfiltered
    # rpn_label, and only then is rpn_label itself overwritten with its filtered version.
    rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score,tf.where(tf.not_equal(rpn_label,-1))),[-1,2])
    rpn_label = tf.reshape(tf.gather(rpn_label,tf.where(tf.not_equal(rpn_label,-1))),[-1])
    # Mean softmax cross-entropy over the remaining entries.
    rpn_cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label))
    # bounding box regression L1 loss
    rpn_bbox_pred = self.net.get_output('rpn_bbox_pred')
    # Outputs 1..3 of 'rpn-data' (targets, inside weights, outside weights) are
    # permuted with [0,2,3,1] before use -- presumably to match the layout of
    # rpn_bbox_pred; TODO confirm against the 'rpn-data' layer.
    rpn_bbox_targets = tf.transpose(self.net.get_output('rpn-data')[1],[0,2,3,1])
    rpn_bbox_inside_weights = tf.transpose(self.net.get_output('rpn-data')[2],[0,2,3,1])
    rpn_bbox_outside_weights = tf.transpose(self.net.get_output('rpn-data')[3],[0,2,3,1])
    # 3.0 is the first argument of the _modified_smooth_l1 helper (presumably its
    # sigma parameter -- confirm against the helper's definition).
    rpn_smooth_l1 = self._modified_smooth_l1(3.0, rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights)
    # Sum the per-element loss over axes 1-3, then average over axis 0.
    rpn_loss_box = tf.reduce_mean(tf.reduce_sum(rpn_smooth_l1, reduction_indices=[1, 2, 3]))
    # R-CNN
    # classification loss
    cls_score = self.net.get_output('cls_score')
    # Output 1 of 'roi-data' is used as the label vector (flattened to 1-D).
    label = tf.reshape(self.net.get_output('roi-data')[1],[-1])
    # Unlike the RPN branch above, no label filtering is applied here.
    cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score, labels=label))
    # bounding box regression L1 loss
    bbox_pred = self.net.get_output('bbox_pred')
    # Outputs 2..4 of 'roi-data' are used as regression targets, inside weights
    # and outside weights respectively; no transpose is applied to them.
    bbox_targets = self.net.get_output('roi-data')[2]
    bbox_inside_weights = self.net.get_output('roi-data')[3]
    bbox_outside_weights = self.net.get_output('roi-data')[4]
    # Same helper as the RPN branch, but called with 1.0 as its first argument.
    smooth_l1 = self._modified_smooth_l1(1.0, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights)
    # Sum over axis 1, then average over axis 0.
    loss_box = tf.reduce_mean(tf.reduce_sum(smooth_l1, reduction_indices=[1]))
    # final loss: unweighted sum of the four terms computed above
    loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box

darknet损失的计算过程可参看network.c

/*
 * Store the average per-layer cost of a network into its cost slot.
 *
 * Every layer whose `cost` pointer is non-NULL contributes the first element
 * of that buffer; the mean of those values is written through `netp->cost`.
 *
 * NOTE(review): when no layer has a cost buffer the divisor stays 0 and the
 * result is 0/0 (NaN on IEEE-754 targets) -- confirm callers never hit this.
 */
void calc_network_cost(network *netp)
{
    float total = 0;
    int contributing = 0;
    int idx = 0;

    while (idx < netp->n) {
        if (netp->layers[idx].cost) {
            total += netp->layers[idx].cost[0];
            contributing += 1;
        }
        ++idx;
    }

    /* int divisor is promoted to float, exactly as in a plain sum/count. */
    *netp->cost = total / contributing;
}

pytorch-yolo2 的损失函数可参考 region_loss.py 中的 forward 函数，也可参考 yolo2-pytorch 项目中的 darknet.py（第 216 行的 for training 部分）：

def forward(self, output, target):
    """Compute the region-loss terms for one batch of raw network output.

    The output is split into per-anchor x, y, w, h, confidence and class
    scores, decoded into predicted boxes, matched against ``target`` by the
    external ``build_targets`` helper, and turned into masked
    coordinate / confidence / class losses that are summed into ``loss``.

    Args:
        output: raw network output; it is viewed as
            (nB, nA, 5 + nC, nH, nW), so it is assumed to arrive as
            (nB, nA * (5 + nC), nH, nW).
        target: ground-truth tensor; only ``target.data`` is used, and it is
            passed straight to ``build_targets`` (its layout is not visible here).

    Notes:
        * The code calls ``torch.cuda`` constructors and ``.cuda()`` directly,
          so it only runs on a CUDA device.
        * This excerpt ends once ``loss`` is computed; no ``return`` statement
          is visible here.
    """

    # output: B x (As*(4+1+num_classes)) x H x W
    t0 = time.time()
    nB = output.data.size(0)
    nA = self.num_anchors
    nC = self.num_classes
    nH = output.data.size(2)
    nW = output.data.size(3)
    output   = output.view(nB, nA, (5+nC), nH, nW)  # reshape to batch x anchor x (5+nC) x feature-map H x W
    # Channels 0/1 (x, y) and 4 (conf) go through a sigmoid; channels 2/3 (w, h) stay raw.
    x    = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW))
    y    = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW))
    w    = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW)
    h    = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW)
    conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW))  # detection parameters output by the network
    # Channels 5 .. 5+nC-1 are the class scores.
    cls  = output.index_select(2, Variable(torch.linspace(5,5+nC-1,nC).long().cuda()))
    cls  = cls.view(nB*nA, nC, nH*nW).transpose(1,2).contiguous().view(nB*nA*nH*nW, nC)  # class scores, one row of nC values per (batch, anchor, cell)
    t1 = time.time()
    # Decode the predictions into boxes; rows of pred_boxes are x, y, w, h.
    pred_boxes = torch.cuda.FloatTensor(4, nB*nA*nH*nW)
    # grid_x / grid_y hold the column / row index of every cell, flattened in
    # the same (batch*anchor, H, W) order as x and y.
    grid_x = torch.linspace(0, nW-1, nW).repeat(nH,1).repeat(nB*nA, 1, 1).view(nB*nA*nH*nW).cuda()
    grid_y = torch.linspace(0, nH-1, nH).repeat(nW,1).t().repeat(nB*nA, 1, 1).view(nB*nA*nH*nW).cuda()
    # self.anchors is viewed as (nA, anchor_step); column 0 is used as the
    # anchor width and column 1 as the anchor height.
    anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda()
    anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda()
    # Tile each anchor value over the batch and over all nH*nW cells.
    anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH*nW).view(nB*nA*nH*nW)
    anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH*nW).view(nB*nA*nH*nW)
    pred_boxes[0] = x.data + grid_x
    pred_boxes[1] = y.data + grid_y  # coordinate offset relative to the output feature-map grid cell
    pred_boxes[2] = torch.exp(w.data) * anchor_w
    pred_boxes[3] = torch.exp(h.data) * anchor_h  # width/height expressed as a ratio relative to the anchor
    pred_boxes = convert2cpu(pred_boxes.transpose(0,1).contiguous().view(-1,4))  # reshaped to a 2-D (nB*nA*nH*nW) x 4 tensor
    t2 = time.time()
    # build_targets is defined elsewhere; it returns counters, masks and the
    # regression / confidence / class targets consumed below.
    nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf,tcls = build_targets(pred_boxes, target.data, self.anchors, nA, nC, \
                                                           nH, nW, self.noobject_scale, self.object_scale, self.thresh, self.seen)
    cls_mask = (cls_mask == 1)
    # Number of predictions whose confidence exceeds 0.25.
    nProposals = int((conf > 0.25).sum().data[0])
    tx    = Variable(tx.cuda())
    ty    = Variable(ty.cuda())
    tw    = Variable(tw.cuda())
    th    = Variable(th.cuda())
    tconf = Variable(tconf.cuda())
    # Keep only the class targets selected by cls_mask.
    tcls  = Variable(tcls.view(-1)[cls_mask].long().cuda())
    coord_mask = Variable(coord_mask.cuda())
    # The mask multiplies both MSE inputs below, so it ends up squared in the
    # loss; taking the square root here makes the effective weight conf_mask itself.
    conf_mask  = Variable(conf_mask.cuda().sqrt())
    # Broadcast the per-row class mask across the nC score columns.
    cls_mask   = Variable(cls_mask.view(-1, 1).repeat(1,nC).cuda())
    cls        = cls[cls_mask].view(-1, nC)  # keep only the score rows selected by cls_mask
    t3 = time.time()
    # size_average=False makes each loss a sum rather than a mean; the squared
    # error terms are additionally halved.
    loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x*coord_mask, tx*coord_mask)/2.0
    loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y*coord_mask, ty*coord_mask)/2.0
    loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w*coord_mask, tw*coord_mask)/2.0
    loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h*coord_mask, th*coord_mask)/2.0
    loss_conf = nn.MSELoss(size_average=False)(conf*conf_mask, tconf*conf_mask)/2.0  # bounding-box confidence term
    loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls)  # classification term
    loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

PyTorch-YOLOv3的损失函数可参考models.py文件

Get outputs

    # Split the last dimension of `prediction`: indices 0-3 are the box
    # parameters, 4 is the confidence and 5+ are the per-class scores.
    # x, y, confidence and class scores go through a sigmoid; w and h stay raw.
    x = torch.sigmoid(prediction[..., 0])  # Center x
    y = torch.sigmoid(prediction[..., 1])  # Center y
    w = prediction[..., 2]  # Width
    h = prediction[..., 3]  # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
    pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
    # Calculate offsets for each grid
    # grid_x holds the column index and grid_y the row index of every cell,
    # both shaped (1, 1, nG, nG) so they broadcast against x and y.
    grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor)
    grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor)
    # Anchors are divided by `stride` (defined outside this excerpt), which
    # puts them in the same units as the grid indices above.
    scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors])
    anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
    anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))
    # Add offset and scale with anchors
    # pred_boxes is allocated with the shape of the 4 box channels and then
    # filled in place; `.data` is used so the decoded boxes carry no gradient.
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + grid_x
    pred_boxes[..., 1] = y.data + grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
    # Training
    # When targets are supplied, match the decoded predictions against them
    # and build the total loss from coordinate, confidence and class terms.
    if targets is not None:
        # Keep the loss modules on the same device as the predictions.
        if x.is_cuda:
            self.mse_loss = self.mse_loss.cuda()
            self.bce_loss = self.bce_loss.cuda()
            self.ce_loss = self.ce_loss.cuda()
        # build_targets is defined elsewhere; every input is moved to the CPU
        # first. It returns counters, two masks and the per-cell targets.
        nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(
            pred_boxes=pred_boxes.cpu().data,
            pred_conf=pred_conf.cpu().data,
            pred_cls=pred_cls.cpu().data,
            target=targets.cpu().data,
            anchors=scaled_anchors.cpu().data,
            num_anchors=nA,
            num_classes=self.num_classes,
            grid_size=nG,
            ignore_thres=self.ignore_thres,
            img_dim=self.image_dim,
        )
        # Number of predictions whose confidence exceeds 0.5.
        nProposals = int((pred_conf > 0.5).sum().item())
        recall = float(nCorrect / nGT) if nGT else 1
        # Guard the denominator the same way recall does: with no confident
        # prediction the division would raise ZeroDivisionError, so report 0.
        precision = float(nCorrect / nProposals) if nProposals else 0
        # Handle masks
        mask = Variable(mask.type(ByteTensor))
        conf_mask = Variable(conf_mask.type(ByteTensor))
        # Handle target variables
        tx = Variable(tx.type(FloatTensor), requires_grad=False)
        ty = Variable(ty.type(FloatTensor), requires_grad=False)
        tw = Variable(tw.type(FloatTensor), requires_grad=False)
        th = Variable(th.type(FloatTensor), requires_grad=False)
        tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
        tcls = Variable(tcls.type(LongTensor), requires_grad=False)
        # Get conf mask where gt and where there is no gt
        conf_mask_true = mask
        # Entries set in conf_mask but not in mask (presumably mask is a
        # subset of conf_mask -- confirm against build_targets).
        conf_mask_false = conf_mask - mask
        # Mask outputs to ignore non-existing objects
        loss_x = self.mse_loss(x[mask], tx[mask])
        loss_y = self.mse_loss(y[mask], ty[mask])
        loss_w = self.mse_loss(w[mask], tw[mask])
        loss_h = self.mse_loss(h[mask], th[mask])
        # Confidence loss is the sum of two BCE terms, one per mask.
        loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss(
            pred_conf[conf_mask_true], tconf[conf_mask_true]
        )
        # tcls is reduced to class indices with argmax over dim 1 before the
        # cross-entropy; the result is scaled by 1 / nB.
        loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1))
        loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

ssd.pytorch的损失函数可参考文件multibox_loss.py

Localization Loss (Smooth L1)

    # Shape: [batch,num_priors,4]
    # Expand the positive-prior mask `pos` (defined outside this excerpt) with a
    # trailing dimension so it matches loc_data and selects all 4 coordinates.
    pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
    # Keep only the predicted / target offsets of the selected priors, one row of 4 each.
    loc_p = loc_data[pos_idx].view(-1, 4)
    loc_t = loc_t[pos_idx].view(-1, 4)
    # size_average=False: the smooth-L1 loss is summed, not averaged.
    loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)
    # Compute max conf across batch for hard negative mining
    # Per-prior classification loss: log-sum-exp of the class scores (helper
    # defined elsewhere) minus the score gathered at the target class conf_t.
    batch_conf = conf_data.view(-1, self.num_classes)
    loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))
    # Hard Negative Mining
    # Reshape to one row per image BEFORE masking: `pos` is indexed per image
    # (dim 1 is the prior axis below), whereas loss_c is still flattened to one
    # row per prior at this point, so masking it directly relies on a
    # shape-mismatched boolean index that strict indexing rejects.
    loss_c = loss_c.view(num, -1)
    loss_c[pos] = 0  # filter out pos boxes for now
    # Sorting the sort indices yields, for every prior, its rank by descending loss.
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    # Negatives per image: negpos_ratio times the positives, capped at
    # (number of priors - 1).
    num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
    # Select the num_neg highest-loss priors of each image as negatives.
    neg = idx_rank < num_neg.expand_as(idx_rank)
    # Confidence Loss Including Positive and Negative Examples
    # Expand both prior masks with a trailing class dimension to match conf_data.
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    # Adding the masks and testing > 0 takes their union: keep the score rows
    # and targets of every prior that is either positive or a mined negative.
    conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
    targets_weighted = conf_t[(pos+neg).gt(0)]
    # size_average=False: the cross-entropy is summed, not averaged.
    loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)
    # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
    # Both terms are returned separately, each divided by the total number of
    # positive priors N; no α factor is applied in this excerpt.
    # NOTE(review): if no prior is positive, N is 0 and both divisions divide
    # by zero -- confirm whether callers guarantee at least one positive.
    N = num_pos.data.sum()
    loss_l /= N
    loss_c /= N
    return loss_l, loss_c