GCN and GIN

```python
# Use the PROTEINS dataset from TUDataset:
# 1113 protein graphs, labeled by whether the protein is an enzyme (binary classification).

# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Linear, Sequential, BatchNorm1d, ReLU, Dropout
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GINConv
from torch_geometric.nn import global_mean_pool, global_add_pool

# Load the dataset
dataset = TUDataset(root='', name='PROTEINS').shuffle()

# Inspect the graph data
print(f'Dataset: {dataset}')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {dataset[0].x.shape[0]}')  # for the first graph; node counts differ from graph to graph
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Split the dataset 8:1:1 into training, validation, and test sets
train_dataset = dataset[:int(len(dataset)*0.8)]
val_dataset = dataset[int(len(dataset)*0.8):int(len(dataset)*0.9)]
test_dataset = dataset[int(len(dataset)*0.9):]

# Sanity check
print('----------------------------------------------')
print(f'training set   = {len(train_dataset)} graphs')  # 890
print(f'validation set = {len(val_dataset)} graphs')    # 111
print(f'test set       = {len(test_dataset)} graphs')   # 112

# Mini-batching: at most 64 graphs per batch
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)   # no need to shuffle evaluation sets
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Sanity check
print('------------------------------------------------')
print('\nTrain Loader')
for i, batch in enumerate(train_loader):
    print(f'- Batch {i}: {batch}')
print('\nValidation Loader')
for i, batch in enumerate(val_loader):
    print(f'- Batch {i}: {batch}')
print('\nTest Loader')
for i, batch in enumerate(test_loader):
    print(f'- Batch {i}: {batch}')

# Build the GCN model for graph classification
class GCN(nn.Module):
    def __init__(self, dim_h):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_features, dim_h)
        self.conv2 = GCNConv(dim_h, dim_h)
        self.conv3 = GCNConv(dim_h, dim_h)
        self.lin = Linear(dim_h, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        h = self.conv1(x, edge_index)
        h = h.relu()
        h = self.conv2(h, edge_index)
        h = h.relu()
        h = self.conv3(h, edge_index)
        # global_mean_pool works well when graph sizes are imbalanced
        hG = global_mean_pool(h, batch)

        # Classifier head
        h = F.dropout(hG, p=0.5, training=self.training)
        h = self.lin(h)
        return F.log_softmax(h, dim=1)
```
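Before defining the second model, it helps to see what the `batch` vector attached by `DataLoader` looks like and what `global_mean_pool` does with it. A minimal sketch on made-up tensors (the values are only for illustration):

```python
import torch
from torch_geometric.nn import global_mean_pool

# Toy mini-batch: 5 nodes with 2 features each, belonging to 2 graphs.
# batch[i] is the index of the graph that node i belongs to.
h = torch.tensor([[1., 0.], [3., 0.], [0., 2.], [0., 4.], [0., 6.]])
batch = torch.tensor([0, 0, 1, 1, 1])

# Averages node embeddings per graph -> one embedding per graph.
hG = global_mean_pool(h, batch)
print(hG)  # tensor([[2., 0.], [0., 4.]]) -- shape [num_graphs, num_features]
```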
```python
# Define the GIN model
class GIN(nn.Module):
    def __init__(self, dim_h):
        super().__init__()
        self.conv1 = GINConv(
            Sequential(
                Linear(dataset.num_features, dim_h),
                BatchNorm1d(dim_h),
                ReLU(),
                Linear(dim_h, dim_h),
                ReLU()
            )
        )
        self.conv2 = GINConv(
            Sequential(
                Linear(dim_h, dim_h),
                BatchNorm1d(dim_h),
                ReLU(),
                Linear(dim_h, dim_h),
                ReLU()
            )
        )
        self.conv3 = GINConv(
            Sequential(
                Linear(dim_h, dim_h),
                BatchNorm1d(dim_h),
                ReLU(),
                Linear(dim_h, dim_h),
                ReLU()
            )
        )
        # Classifier head.
        # Following the paper, the graph embeddings from all three layers are concatenated.
        self.lin1 = Linear(dim_h*3, dim_h*3)
        self.lin2 = Linear(dim_h*3, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        h1 = self.conv1(x, edge_index)
        h2 = self.conv2(h1, edge_index)
        h3 = self.conv3(h2, edge_index)
        # Sum pooling is more expressive than the other two global pooling
        # techniques (mean pooling and max pooling).
        # To capture all structural information, we use the embeddings produced
        # by every GNN layer: sum-pool each layer's node embeddings, then concatenate.
        h1 = global_add_pool(h1, batch)
        h2 = global_add_pool(h2, batch)
        h3 = global_add_pool(h3, batch)

        h = torch.cat((h1, h2, h3), dim=1)

        # Classification
        h = self.lin1(h)
        h = h.relu()
        h = F.dropout(h, p=0.5, training=self.training)
        h = self.lin2(h)
        return F.log_softmax(h, dim=1)

# Training loop
def train(model, loader):
    # NLLLoss, since both models output log-probabilities (log_softmax)
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    epochs = 100
    for epoch in range(epochs+1):
        model.train()  # reset to training mode (test() switches to eval mode)
        total_loss = 0
        acc = 0
        for data in loader:
            # Zero the gradients
            optimizer.zero_grad()
            # Forward pass
            out = model(data.x, data.edge_index, data.batch)
            # Loss for this batch
            loss = criterion(out, data.y)
            # Running epoch loss
            total_loss += loss / len(loader)
            # Running epoch accuracy
            acc += accuracy(out.argmax(dim=1), data.y) / len(loader)
            # Backward pass
            loss.backward()
            # Update the parameters
            optimizer.step()

        # Validate once per epoch
        val_loss, val_acc = test(model, val_loader)

        # Print metrics every 20 epochs
        if epoch % 20 == 0:
            print(f'Epoch {epoch:>3} | Train Loss: {total_loss:.2f} | Train Acc: {acc*100:>5.2f}% | Val Loss: {val_loss:.2f} | Val Acc: {val_acc*100:.2f}%')
    return model

def accuracy(pred_y, y):
    return ((pred_y == y).sum() / len(y)).item()

@torch.no_grad()
def test(model, loader):
    criterion = nn.NLLLoss()
    model.eval()
    loss = 0
    acc = 0
    for data in loader:
        out = model(data.x, data.edge_index, data.batch)
        loss += criterion(out, data.y) / len(loader)
        acc += accuracy(out.argmax(dim=1), data.y) / len(loader)
    return loss, acc

# Train both models
print('GCN Training')
gcn = GCN(dim_h=32)
gcn = train(gcn, train_loader)
print('GIN Training')
gin = GIN(dim_h=32)
gin = train(gin, train_loader)

test_loss, test_acc = test(gcn, test_loader)
print(f'GCN test Loss: {test_loss:.2f} | GCN test Acc: {test_acc*100:.2f}%')

test_loss, test_acc = test(gin, test_loader)
print(f'GIN test Loss: {test_loss:.2f} | GIN test Acc: {test_acc*100:.2f}%')
```
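After training, either model can be applied to a single graph. A quick sketch, reusing `gin` and `test_dataset` from the script above; note that even a lone graph needs a `batch` vector, here assigning every node to graph 0:

```python
data = test_dataset[0]
gin.eval()
with torch.no_grad():
    # All nodes of this single graph belong to graph index 0
    batch = torch.zeros(data.x.shape[0], dtype=torch.long)
    out = gin(data.x, data.edge_index, batch)
pred = out.argmax(dim=1).item()
print(f'Predicted class: {pred} | True class: {data.y.item()}')
```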

GCN idea:
Aggregate the features of each node together with those of its neighbors through a convolution operation.
The layer-wise propagation rule is:

$$H^{l+1}=\sigma\left(\tilde{D}^{-1/2}\tilde{A}\tilde{D}^{-1/2}H^{l}W^{l}\right)$$

where $\tilde{A}=A+I$ is the adjacency matrix with self-loops added, $\tilde{D}$ is its degree matrix, $H^{l}$ is the node-feature matrix at layer $l$, $W^{l}$ is that layer's weight matrix, and $\sigma$ is a nonlinearity such as ReLU.
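To make the rule concrete, here is a minimal sketch that builds $\tilde{A}$ and $\tilde{D}^{-1/2}$ for a toy 3-node graph and applies one propagation step (dense matrices and random features, purely for illustration; `GCNConv` implements the same rule with sparse operations):

```python
import torch

# Toy undirected graph with 3 nodes: edges 0-1 and 1-2
A = torch.tensor([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
A_tilde = A + torch.eye(3)                           # add self-loops
D_inv_sqrt = torch.diag(A_tilde.sum(dim=1) ** -0.5)  # D_tilde^{-1/2}

H = torch.rand(3, 4)                                 # node features H^{l}
W = torch.rand(4, 2)                                 # layer weights W^{l}

# One propagation step: H^{l+1} = sigma(D^{-1/2} A D^{-1/2} H W)
H_next = torch.relu(D_inv_sqrt @ A_tilde @ D_inv_sqrt @ H @ W)
print(H_next.shape)  # torch.Size([3, 2])
```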
GIN idea:
Goal: strengthen the discriminative power of graph neural networks so they can better distinguish different graphs, by introducing a more powerful aggregation function.
The node update rule is:

$$h_{v}^{k}=\mathrm{MLP}^{k}\left((1+\varepsilon)\cdot h_{v}^{k-1}+\sum_{u\in\mathcal{N}(v)}h_{u}^{k-1}\right)$$

$\varepsilon$ is a learnable parameter or a fixed scalar that adjusts the contribution of the node's own embedding (its self-loop) relative to its neighbors.
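Written directly in PyTorch on a dense toy graph, the update makes the role of $\varepsilon$ and the sum aggregation explicit (a sketch only; `GINConv` implements this sparsely, with `eps` learnable when `train_eps=True`):

```python
import torch
import torch.nn as nn

# Toy graph: adjacency without self-loops, 3 nodes with 4 features each
A = torch.tensor([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
h = torch.rand(3, 4)

eps = 0.0  # epsilon; GINConv makes this learnable with train_eps=True
mlp = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 8))

# h_v^{k} = MLP((1 + eps) * h_v^{k-1} + sum over neighbors of h_u^{k-1})
# A @ h computes the neighbor sum for every node at once.
h_next = mlp((1 + eps) * h + A @ h)
print(h_next.shape)  # torch.Size([3, 8])
```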
