1. 利用梯度下降法，计算二次函数 $y = x^2 + x + 4$ 的最小值

def target_function(x):
    """Return the value of the quadratic y = x^2 + x + 4 at ``x``."""
    return x ** 2 + x + 4


def gradient(x):
    """Return dy/dx = 2x + 1, the derivative of ``target_function`` at ``x``."""
    return 2 * x + 1


# Plain gradient descent: start from x_init and repeatedly step against the gradient.
x_init = 10
x = x_init
steps = 100
lr = 0.1
# Use the configured step count rather than a second hard-coded literal.
for _ in range(steps):
    x = x - lr * gradient(x)
print(f"最小值 f(x) = {target_function(x):.4f}")

2. 实现交叉熵损失、Softmax以及Sigmoid
在这里插入图片描述

#实现Softmax、Logsoftmax、Sigmoid以及交叉熵损失
import torch
import torch.nn.functional as F


def softmax(x, dim=-1):
    """Return the softmax of ``x`` along ``dim``.

    The per-slice maximum is subtracted before exponentiating. Softmax is
    invariant to that shift, and it keeps every exponent <= 0, so ``exp``
    cannot overflow to ``inf`` (which would turn the result into ``nan``).

    Args:
        x: Input tensor of unnormalized scores.
        dim: Dimension along which the outputs sum to 1.

    Returns:
        Tensor with the same shape as ``x``.
    """
    shifted = x - torch.max(x, dim=dim, keepdim=True)[0]
    exp_x = torch.exp(shifted)
    return exp_x / torch.sum(exp_x, dim=dim, keepdim=True)


# 1. Overflow: as x grows towards +infinity, exp(x) exceeds the representable range.
# 2. Underflow: as x goes towards -infinity, exp(x) is flushed to 0, so a
#    subsequent log evaluates log(0). Avoid computing exp(x) on its own.
# Remedy: 1. subtract the maximum  2. expand the log analytically first.
def log_softmax(x, dim=-1):
    """Return ``log(softmax(x))`` along ``dim`` without forming softmax first.

    Uses the identity ``log_softmax(x) = (x - m) - log(sum(exp(x - m)))`` with
    ``m = max(x)`` taken along ``dim``. The largest exponent is then exactly 0,
    so the sum is at least 1 and the log never sees 0.

    Args:
        x: Input tensor of unnormalized scores.
        dim: Dimension along which to normalize.

    Returns:
        Tensor of log-probabilities with the same shape as ``x``.
    """
    # Reduce along the caller's ``dim`` for both the max and the sum.
    shifted = x - torch.max(x, dim=dim, keepdim=True)[0]
    return shifted - torch.log(torch.sum(torch.exp(shifted), dim=dim, keepdim=True))


# Sanity checks against the PyTorch built-ins:
# x = torch.rand((2,3))
# print(torch.allclose(F.softmax(x,dim=-1),softmax(x)))
# print(torch.allclose(log_softmax(x),torch.log(softmax(x))))
# print(torch.allclose(F.log_softmax(x,dim=-1),log_softmax(x)))


def sigmoid(x):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    ``exp`` is only ever evaluated on non-positive arguments (``-|x|``), so it
    cannot overflow. The naive form computes ``exp(-x)``, which becomes ``inf``
    for large negative ``x`` and yields ``nan`` gradients in the backward pass.

    Args:
        x: Input tensor.

    Returns:
        Tensor of values in [0, 1] with the same shape as ``x``.
    """
    non_negative = x >= 0
    # z = exp(-|x|) lies in (0, 1]. ``where`` is used instead of ``abs`` so the
    # gradient at x == 0 is that of the -x branch (abs has zero gradient at 0).
    z = torch.exp(torch.where(non_negative, -x, x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    return torch.where(non_negative, 1 / (1 + z), z / (1 + z))


# print(torch.allclose(torch.sigmoid(x),sigmoid(x)))


def cross_entropy_loss(y_pred, y_true):
    """Return the per-sample cross-entropy between targets and predicted scores.

    Args:
        y_pred: Unnormalized scores; ``log_softmax`` is applied over the last
            dimension.
        y_true: Target weights with the same shape as ``y_pred`` (the commented
            example below passes one-hot rows).

    Returns:
        Tensor with the last dimension reduced away: ``-sum(y_true * log p)``.
        No reduction is applied over the remaining dimensions.
    """
    log_probs = log_softmax(y_pred, dim=-1)
    return -torch.sum(y_true * log_probs, dim=-1)


# input = torch.rand((2,3))
# label_onehot = torch.tensor([[0,0,1],[0,1,0]])
# print(cross_entropy_loss(input,label_onehot))
# # PyTorch's built-in cross-entropy takes class indices, not one-hot vectors
# label = torch.argmax(label_onehot,dim=-1)
# offi_cross_entropy_loss = torch.nn.CrossEntropyLoss(reduction="none")
# print(torch.allclose(offi_cross_entropy_loss(input,label), cross_entropy_loss(input,label_onehot)))
# print(offi_cross_entropy_loss(input,label))

3. 实现一下CLIP和SigLIP的loss

在这里插入图片描述

import torch
import torch.nn.functional as F
import torch.nn as nn


class Clip_loss(nn.Module):
    """Symmetric softmax contrastive loss as used by CLIP.

    Row i of ``image_features`` and row i of ``text_features`` form the
    positive pair; every other row in the batch acts as a negative.

    Args:
        temp_init: Initial value of the logit scale (inverse temperature).
    """

    def __init__(self, temp_init=1/0.07):
        super().__init__()
        # Learn the scale in log space so exp() keeps it positive.
        # float() lets integer initial values produce a trainable float tensor.
        self.log_temp = torch.nn.Parameter(torch.log(torch.tensor(float(temp_init))))

    def forward(self, image_features, text_features):
        """Return the scalar CLIP loss.

        Args:
            image_features: shape (N, D)
            text_features: shape (N, D)

        Returns:
            Mean of the image-to-text and text-to-image cross-entropies.
        """
        # L2-normalize so the dot products below are cosine similarities.
        image_features = F.normalize(image_features, p=2, dim=-1)
        text_features = F.normalize(text_features, p=2, dim=-1)

        temp = torch.exp(self.log_temp)
        logits_per_image = torch.matmul(image_features, text_features.transpose(0, 1)) * temp
        logits_per_text = logits_per_image.transpose(0, 1)

        # The matching text for image i sits at column i.
        n = image_features.shape[0]
        labels = torch.arange(n, device=image_features.device)

        loss_i2t = F.cross_entropy(logits_per_image, labels)
        loss_t2i = F.cross_entropy(logits_per_text, labels)
        return (loss_i2t + loss_t2i) / 2


class SigLip_loss(nn.Module):
    """Pairwise sigmoid loss as used by SigLIP.

    Every (image, text) pair in the batch is treated as an independent binary
    classification: diagonal pairs are positives, all others negatives.

    Args:
        temp_init: Initial value of the logit scale (inverse temperature).
        bias_init: Initial value of the learnable logit bias.
    """

    # NOTE(review): the SigLIP paper initializes the bias to -10 (and the scale
    # to 10) to offset the heavy negative/positive imbalance; the defaults here
    # are kept as-is for backward compatibility -- confirm they are intended.
    def __init__(self, temp_init=1/0.07, bias_init=10.0):
        super().__init__()
        # float() lets integer initial values produce trainable float tensors.
        self.log_temp = torch.nn.Parameter(torch.log(torch.tensor(float(temp_init))))
        self.bias = torch.nn.Parameter(torch.tensor(float(bias_init)))

    def forward(self, image_features, text_features):
        """Return the scalar SigLIP loss.

        Args:
            image_features: shape (N, D)
            text_features: shape (N, D)

        Returns:
            Negative log-sigmoid summed over all N*N pairs, divided by N.
        """
        image_features = F.normalize(image_features, p=2, dim=-1)
        text_features = F.normalize(text_features, p=2, dim=-1)

        # Compute temperature and logits
        temp = torch.exp(self.log_temp)
        logits = torch.matmul(image_features, text_features.transpose(0, 1)) * temp + self.bias

        # Create labels: 1 for positive pairs (diagonal), -1 for negative pairs
        n = image_features.shape[0]
        labels = 2 * torch.eye(n, n, device=logits.device) - torch.ones(n, n, device=logits.device)

        # Compute pairwise sigmoid loss. Normalize by the batch size N (not by
        # the N*N pair count) to match the reference SigLIP formulation.
        return -torch.sum(F.logsigmoid(labels * logits)) / n


siglip_loss = SigLip_loss()
clip_loss = Clip_loss()
image_features = torch.rand((10, 768))
text_features = torch.rand((10, 768))
siglip = siglip_loss(image_features, text_features)
clip = clip_loss(image_features, text_features)
print(siglip)
print(clip)

4. 实现Multi-head-attention

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class MHA(nn.Module):
    """Causal multi-head self-attention.

    Args:
        n_head: Number of attention heads; must divide ``n_emb``.
        n_emb: Embedding (model) dimension.
        block_size: Longest sequence length the causal mask supports.
        bias: Whether the linear projections carry a bias term.
        drop_rate: Dropout probability for the attention weights and the output.

    Raises:
        ValueError: If ``n_emb`` is not divisible by ``n_head``.
    """

    def __init__(self, n_head, n_emb, block_size, bias=True, drop_rate=0.1):
        super().__init__()
        if n_emb % n_head != 0:
            raise ValueError(f"n_emb ({n_emb}) must be divisible by n_head ({n_head})")
        self.n_head = n_head
        self.n_emb = n_emb
        self.q_proj = nn.Linear(n_emb, n_emb, bias=bias)
        self.k_proj = nn.Linear(n_emb, n_emb, bias=bias)
        self.v_proj = nn.Linear(n_emb, n_emb, bias=bias)
        self.output_proj = nn.Linear(n_emb, n_emb, bias=bias)
        self.attn_drop = nn.Dropout(drop_rate)
        self.out_drop = nn.Dropout(drop_rate)
        # Lower-triangular ones: entry (i, j) is 1 when query i may attend to key j.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size),
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (B, T, C) with ``C == n_emb`` and ``T <= block_size``.

        Returns:
            Tensor of shape (B, T, C).

        Raises:
            ValueError: If ``T`` exceeds the ``block_size`` the mask was built for.
        """
        B, T, C = x.shape
        max_len = self.mask.size(-1)
        if T > max_len:
            raise ValueError(f"sequence length {T} exceeds block_size {max_len}")
        head_dim = C // self.n_head

        # (B, T, C) -> (B, n_head, T, head_dim)
        q = self.q_proj(x).view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, T, self.n_head, head_dim).transpose(1, 2)

        attn = torch.matmul(q, k.transpose(-1, -2)) * (1 / math.sqrt(head_dim))
        # Block the *future* positions (mask == 0). masked_fill needs a boolean
        # mask, and filling where the mask is 1 would hide the allowed positions.
        attn = attn.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        attn = F.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        # (B, n_head, T, head_dim) -> (B, T, C)
        output = torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, C)
        output = self.output_proj(output)
        # The output has its own dropout layer, separate from the attention one.
        return self.out_drop(output)


# Renamed from ``input`` to avoid shadowing the builtin.
sample_input = torch.rand(10, 10, 768)
attention = MHA(n_head=8, n_emb=768, block_size=10)
print(attention(sample_input).shape)