Training MNIST on multiple GPUs with PyTorch

喜欢ヅ旅行 · 2022-05-16 11:08

The hyperparameters in the code below have not been tuned, so the accuracy may not be high; it is meant only as a reference for the overall code structure.
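The multi-GPU part of the listing is the single torch.nn.DataParallel call: it replicates the model onto every GPU in device_ids, splits each input batch along dimension 0, runs the replicas in parallel, and gathers the outputs back on the first device. The listing hard-codes eight GPUs; the short sketch below (not part of the original code, names are illustrative) shows how the same call could instead adapt to however many GPUs are actually visible:

    import torch
    import torch.nn as nn

    n_gpus = torch.cuda.device_count()   # GPUs visible to this process
    device_ids = list(range(n_gpus))     # e.g. [0, 1, 2, 3] on a 4-GPU machine
    net = nn.Linear(10, 2)               # stand-in module; the post uses a small ResNet
    if n_gpus > 1:
        net = nn.DataParallel(net, device_ids=device_ids).cuda()
    elif n_gpus == 1:
        net = net.cuda()                 # DataParallel adds nothing on a single GPU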

import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets
from torchvision import transforms
from torch.autograd import Variable
import torch.utils.data.distributed
import math
import time
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"  # expose all eight GPUs to this process

lr = 0.001
batch_size = 100
epochs = 10
test_batch_size = 100
momentum = 0.5
log_interval = 100

# Input data transforms (defined here but not applied; the MNIST datasets below use their own Compose)
transform_list = [
    transforms.Resize(40),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32),
    transforms.ToTensor()
]
transform = transforms.Compose(transform_list)

torch.manual_seed(2018)
# Horovod: pin GPU to local rank.
# torch.cuda.set_device(hvd.local_rank())
torch.cuda.manual_seed(2018)

kwargs = {'num_workers': 4, 'pin_memory': True}

train_dataset = \
    datasets.MNIST('data-0', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, **kwargs)  # shuffle the training set each epoch

test_dataset = \
    datasets.MNIST('data-0', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ]))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size,
                                          **kwargs)
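# (Added sketch, not part of the original post.) The Normalize constants (0.1307, 0.3081)
# above are the MNIST training-set mean and std; they can be reproduced from the raw pixels:
mnist_pixels = train_dataset.data.float() / 255.0   # uint8 images -> floats in [0, 1]
print('MNIST mean/std:', mnist_pixels.mean().item(), mnist_pixels.std().item())  # ~0.1307 / ~0.3081
del mnist_pixels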
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

# Residual Block
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(ResidualBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
# ResNet Module
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=10):
        super(ResNet, self).__init__()
        self.in_channels = 16
        self.conv = conv3x3(1, 16)       # single input channel for MNIST
        self.bn = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self.make_layer(block, 16, layers[0])
        self.layer2 = self.make_layer(block, 32, layers[1], 2)
        self.layer3 = self.make_layer(block, 64, layers[2], 2)
        self.layer4 = self.make_layer(block, 128, layers[3])
        self.avg_pool = nn.AvgPool2d(7)
        self.fc = nn.Linear(128, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if (stride != 1) or (self.in_channels != out_channels):
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, stride=stride),
                nn.BatchNorm2d(out_channels))
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels
        for i in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # print("###############", out.size())
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out
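# (Added sanity check, not part of the original post.) With a 28x28 MNIST input the
# feature map is 7x7 before nn.AvgPool2d(7), so the classifier sees 128 features
# and the network outputs one logit per class:
_probe = ResNet(ResidualBlock, [2, 2, 2, 2])
assert _probe(torch.zeros(2, 1, 28, 28)).shape == (2, 10)
del _probe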
# model = ResNet(ResidualBlock, [2, 2, 2, 2])  # .cuda()  # single-device variant
# DataParallel replicates the model on the listed GPUs and splits every batch across them.
model = torch.nn.DataParallel(ResNet(ResidualBlock, [2, 2, 2, 2]),
                              device_ids=[0, 1, 2, 3, 4, 5, 6, 7]).cuda()
# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(), lr=lr,
                      momentum=momentum)
criterion = nn.CrossEntropyLoss()

def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)  # Variable is a no-op since PyTorch 0.4 and could be dropped
        optimizer.zero_grad()
        output = model(data)
        # loss = F.nll_loss(output, target)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

for epoch in range(1, epochs + 1):
    train(epoch)

# Test phase
# model.load_state_dict(torch.load('param_model.pkl'))
# test()
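The last two commented-out lines refer to a checkpoint file and a test() routine that the post never defines. Below is a minimal sketch of what those missing pieces could look like (not part of the original code; it reuses the model, criterion, and test_loader defined above and the 'param_model.pkl' filename from the comments):

    def test():
        model.eval()
        test_loss, correct = 0.0, 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.cuda(), target.cuda()
                output = model(data)
                test_loss += criterion(output, target).item() * data.size(0)  # sum of per-sample losses
                correct += output.argmax(dim=1).eq(target).sum().item()
        n = len(test_loader.dataset)
        print('Test: average loss {:.4f}, accuracy {}/{} ({:.1f}%)'.format(
            test_loss / n, correct, n, 100. * correct / n))

    # With DataParallel the actual network lives in model.module, so saving
    # model.module.state_dict() keeps the checkpoint loadable without DataParallel.
    torch.save(model.module.state_dict(), 'param_model.pkl')
    test()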
