
AI MNIST - CNN PyTorch validation and normalization

이게될까 2023. 12. 2. 20:51
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import time
device= "cuda" if torch.cuda.is_available() else "cpu"
print(device)
trainset = dsets.MNIST(root='dataset/',
                       train=True,
                       transform= transforms.ToTensor(),
                       download = True)
testset = dsets.MNIST(root='dataset/',
                       train=False,
                       transform= transforms.ToTensor(),
                       download = True)
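# scale the raw uint8 pixels to [0, 1]; .data is indexed directly below, so this bypasses the ToTensor() transform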
trainset.data = trainset.data/255
testset.data = testset.data/255
relu = nn.ReLU()
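# conv + BatchNorm + ReLU blocks with max pooling, global average pooling, then a small fully connected head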
model=nn.Sequential(nn.Conv2d(in_channels=1,out_channels=6,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(6),
                    relu,
                    nn.MaxPool2d(kernel_size = 2,stride = 2,padding = 0, dilation = 1, ceil_mode= False),
                    nn.Conv2d(in_channels=6,out_channels=20,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(20),
                    relu,
                    nn.Conv2d(in_channels=20,out_channels=20,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(20),
                    relu,
                    nn.Conv2d(in_channels=20,out_channels=20,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(20),
                    relu,
                    nn.MaxPool2d(kernel_size = 2,stride = 2,padding = 0, dilation = 1, ceil_mode= False),

                    nn.Conv2d(in_channels=20,out_channels=40,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(40),
                    relu,
                    nn.Conv2d(in_channels=40,out_channels=40,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(40),
                    relu,
                    nn.Conv2d(in_channels=40,out_channels=40,kernel_size = 5,stride=1, padding=2,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(40),
                    relu,
                    nn.MaxPool2d(kernel_size = 2,stride = 2,padding = 1, dilation = 1, ceil_mode= False),

                    nn.Conv2d(in_channels=40,out_channels=80,kernel_size = 3,stride=1, padding=1,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(80),
                    relu,
                    nn.Conv2d(in_channels=80,out_channels=80,kernel_size = 3,stride=1, padding=1,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(80),
                    relu,
                    nn.Conv2d(in_channels=80,out_channels=80,kernel_size = 3,stride=1, padding=1,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(80),
                    relu,
                    nn.MaxPool2d(kernel_size = 2,stride = 2,padding = 0, dilation = 1, ceil_mode= False),

                    nn.Conv2d(in_channels=80,out_channels=160,kernel_size = 3,stride=1, padding=1,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(160),
                    relu,
                    nn.Conv2d(in_channels=160,out_channels=160,kernel_size = 3,stride=1, padding=1,dilation=1,groups=1,bias=True),
                    nn.BatchNorm2d(160),
                    relu,
                    nn.Conv2d(in_channels=160,out_channels=160,kernel_size = 3,stride=1, padding=1,dilation=1,groups=1,bias=True),
                    nn.AdaptiveAvgPool2d((1,1)),
                    nn.Flatten(),
                    nn.Linear(160,80,bias =True),
                    nn.BatchNorm1d(80),
                    relu,
                    nn.Dropout(0.3),
                    nn.Linear(80,40,bias =True),
                    nn.BatchNorm1d(40),
                    relu,
                    nn.Dropout(0.3),
                    nn.Linear(40,10,bias =True),
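                    # note: f.cross_entropy already applies log-softmax, so this final Softmax is not strictly needed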
                    nn.Softmax(dim = 1)
                     ).to(device)
lr = 0.0001
epochs = 600
batch_size = 4000
dropout = 0.1  # unused here; dropout is fixed at 0.3 inside the model
# hold out the first 6,000 training images as a validation split
dataset = TensorDataset(trainset.data[6000:, :, :], trainset.targets[6000:])
valset = TensorDataset(trainset.data[:6000, :, :], trainset.targets[:6000])
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = optim.Adam(model.parameters(),lr=lr,betas=(0.9,0.999), weight_decay=0.001)
best_val_loss = float('inf')
loss_graph=[]
val_loss_graph=[]
start= time.time()
for k in range (epochs +1):
  model.train()  # set training mode
  for i ,sample in enumerate(dataloader) :
    (x,y) = sample
    optimizer.zero_grad()
    cost = f.cross_entropy(model(torch.unsqueeze(x,1).float().to(device)),y.to(device)).to(device)
    cost.backward()
    optimizer.step()
  if k%5 == 0:
    print(k, cost.item())
  loss_graph.append(cost.item())
  model.eval()  # set evaluation mode
  total_val_loss = 0
  with torch.no_grad():
    loss =  f.cross_entropy(model(torch.unsqueeze(trainset.data[:6000,:,:],1).float().to(device)),trainset.targets[:6000].to(device)).to(device)
    total_val_loss += loss.item()

  avg_val_loss = total_val_loss
  #print(f'Epoch {k}, Validation Loss: {avg_val_loss}')
  val_loss_graph.append(avg_val_loss)
  # save the model with the best validation loss
  if avg_val_loss < best_val_loss:
      best_val_loss = avg_val_loss
      torch.save(model.state_dict(), 'best_model.pth')
      print(f'best model saved : best_val_loss: {best_val_loss}')
print("time : {} sec".format(time.time()-start))
plt.figure()
plt.plot(loss_graph, label='train loss (last batch of each epoch)')
plt.plot(val_loss_graph, label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
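# test and train accuracy with the final-epoch weights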
print((torch.argmax(model(torch.unsqueeze(testset.data,1).data.float().to(device)),dim=1).float()==testset.targets.to(device)).float().mean())
print((torch.argmax(model(torch.unsqueeze(trainset.data[6000:16000,:,:] ,1).float().to(device)),dim=1).float()==trainset.targets[6000:16000].to(device)).float().mean())
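# reload the checkpoint with the best validation loss and evaluate again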
model.load_state_dict(torch.load('best_model.pth'))
print((torch.argmax(model(torch.unsqueeze(testset.data,1).data.float().to(device)),dim=1).float()==testset.targets.to(device)).float().mean())
print((torch.argmax(model(torch.unsqueeze(trainset.data[6000:16000,:,:] ,1).float().to(device)),dim=1).float()==trainset.targets[6000:16000].to(device)).float().mean())
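For reference, the validation loss in the loop above comes from a single forward pass over the entire 6,000-image hold-out. A minimal sketch of the same check done in mini-batches, using a DataLoader over the valset split defined above (the val_dataloader and evaluate names and the batch size of 1000 are my own choices for illustration, not part of the original run), could look like this:

val_dataloader = DataLoader(valset, batch_size=1000, shuffle=False)

def evaluate(model, loader):
    # average per-sample cross-entropy and accuracy over a DataLoader
    model.eval()
    total_loss, total_correct, total_count = 0.0, 0, 0
    with torch.no_grad():
        for x, y in loader:
            x = torch.unsqueeze(x, 1).float().to(device)  # add the channel dimension
            y = y.to(device)
            out = model(x)
            total_loss += f.cross_entropy(out, y, reduction='sum').item()
            total_correct += (torch.argmax(out, dim=1) == y).sum().item()
            total_count += y.size(0)
    return total_loss / total_count, total_correct / total_count

val_loss, val_acc = evaluate(model, val_dataloader)
print(f'val loss: {val_loss:.4f}, val acc: {val_acc:.4f}')

Summing the per-sample losses and dividing by the sample count keeps the number comparable regardless of how the split is batched.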

For MNIST on a network I built myself, with no transfer learning, this is about as good as it gets: it scored 99.4.

| Network description | Result | Time (s) | Score |
| --- | --- | --- | --- |
| Simple FCN, 20 epochs, 4 layers | Low learning rate | 57.8 | 62 |
| Only epochs increased, to 80 | No training progress | 220 | 72 |
| Added 2 layers | No training progress | 247 | 75 |
| Added ReLU, reduced epochs to 40, increased batch size | Score low compared to train score of 99 | 30 | 97 |
| Added BatchNorm1d and dropout | Training did not converge | 32 | 97 |
| Increased epochs | Training did not converge | 46 | 97 |
| Increased epochs, added regularization | Training did not converge | 77 | 97 |
| Reduced regularization and dropout | Train score 99.7 | 80 | 97 |
| Increased epochs, regularization, and dropout; data scaling | Train score 99.6 | 174 | 97.7 |
| Increased epochs, increased regularization | Train score 99.5 | 293 | 97.4 |
| Increased regularization | Train score 99.5 | 302 | 97.5 |
| Increased regularization | Underfitting | 304 | 78.7 |
| (everything below uses a CNN) | | | |
| Simple 3-layer CNN (kernel 3, stride=1, padding=0, dilation=1) then 1-layer FCN, 20 epochs, same scaling | Underfitting | 45 | 20 |
| Increased epochs, added ReLU | Underfitting | 83 | 17 |
| Added padding, 1 CNN layer, 1 FCN layer | Underfitting | 263 | 25.6 |
| Added max pooling | Underfitting | 56 | 59.2 |
| Added 1 CNN layer, reduced batch size | Underfitting | 72 | 67.1 |
| Increased epochs, added 1 CNN layer | Underfitting | 269 | 66.2 |
| Increased kernel size, added BatchNorm1d | Train score 99.2 | 274 | 98.9 |
| Increased kernel size, more epochs, added weight_decay | Train score 99.2 | 473 | 98.9 |
| Increased epochs | Train score 99.7 | 624 | 99.1 |
| Added BatchNorm2d to the CNN, dropout to the FCN | Train score 99.9 | 648 | 99 |
| Added validation, evaluated with the best-validation weights, increased epochs, increased weight_decay | Underfitting | 769 | 60 |
| Reduced weight_decay | Train 99.99 | 767 | 99.1 |
| Increased epochs (moved from Colab to Kaggle after the GPU quota ran out) | Train 99.97 | 457 | 99.2 |
| Added repeated CNN blocks, increased epochs | Train 99.99 | 778 | 99.4 |
| Added repeated CNN blocks, increased epochs | Train 99.98 | 1064 | 99.3 |
| Increased epochs | Train 99.98 | 1597 | 99.4 |
| Added one more CNN and one more FCN layer | Train 99.99 | 1802 | 99.2 |

The table above shows how the network evolved over the experiments. The CNN clearly reaches a higher ceiling than the plain FCN.
