import torch
import torch.nn as nn
import torch.nn.functional as F


class R1Distiller(nn.Module):
    """Knowledge-distillation trainer combining three terms:

    1. supervised task loss (cross-entropy on the student's logits),
    2. KD loss (temperature-scaled KL divergence, student vs. teacher logits),
    3. R1 regularizer (1 - mean cosine similarity between L2-normalized
       teacher and student feature vectors).
    """

    def __init__(self, teacher, student, temperature=4.0, alpha=0.5, beta=0.1):
        """
        Args:
            teacher: pretrained teacher network, kept frozen. Its forward is
                assumed to return ``(logits, features)`` — TODO confirm
                against the actual teacher implementation.
            student: trainable student network; forward returns
                ``(logits, features)``.
            temperature: softmax temperature for the KD term.
            alpha: weight of the KD loss in the total.
            beta: weight of the R1 feature-alignment loss in the total.
        """
        super().__init__()
        self.teacher = teacher
        self.student = student
        # These were read in forward() but never initialized in the original
        # code (AttributeError at runtime); exposed as keyword args.
        self.temperature = temperature
        self.alpha = alpha
        self.beta = beta
        # Freeze the teacher so an optimizer over self.parameters() never
        # updates it (no_grad alone does not stop a naive optimizer step).
        for p in self.teacher.parameters():
            p.requires_grad_(False)

    def forward(self, x, labels):
        """Return the scalar total distillation loss for a batch.

        Args:
            x: input batch accepted by both teacher and student.
            labels: class indices for the cross-entropy task loss.
        """
        # Teacher forward pass (no gradient tracking; parameters frozen).
        # NOTE(review): the original used self.teacher.extract_features(x)
        # yet also consumed an undefined t_logits; the teacher must supply
        # both — assumed interface (logits, features), verify with caller.
        with torch.no_grad():
            t_logits, t_features = self.teacher(x)

        # Student forward pass.
        s_logits, s_features = self.student(x)

        # Supervised task loss.
        loss_task = F.cross_entropy(s_logits, labels)

        # Distillation loss: KL divergence between temperature-softened
        # distributions; the T^2 factor keeps gradient magnitudes comparable
        # across temperatures (standard Hinton-style scaling).
        loss_kd = F.kl_div(
            F.log_softmax(s_logits / self.temperature, dim=1),
            F.softmax(t_logits / self.temperature, dim=1),
            reduction='batchmean',
        ) * (self.temperature ** 2)

        # R1 regularizer: penalize feature misalignment via cosine similarity
        # of L2-normalized feature vectors.
        t_features_norm = F.normalize(t_features, p=2, dim=1)
        s_features_norm = F.normalize(s_features, p=2, dim=1)
        loss_r1 = 1 - torch.mean(
            torch.sum(t_features_norm * s_features_norm, dim=1)
        )

        # Weighted total. The original line was truncated/garbled
        # ("... + self.beta lana[...]"); reconstructed as beta * loss_r1.
        total_loss = loss_task + self.alpha * loss_kd + self.beta * loss_r1
        return total_loss
