Ch 12. Introduction to Deep Learning (Classification)

Part 3. Binary Classification

Load Dataset from sklearn
  • In [1] :
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
  • In [2] :
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

# Breast cancer dataset
  • In [3] :
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['class'] = cancer.target

df.tail()
  • Out [3] :
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension class
564 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 0.1726 0.05623 ... 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 0
565 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 0.1752 0.05533 ... 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 0
566 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 0.1590 0.05648 ... 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 0
567 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 0.2397 0.07016 ... 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 0
568 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 0.1587 0.05884 ... 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 1

5 rows × 31 columns
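
Before building a model, it is worth checking the class balance; in sklearn's encoding, 0 is malignant and 1 is benign. A quick check on the df built above:

# 357 benign (1) vs. 212 malignant (0) samples in this dataset.
print(df['class'].value_counts())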

  • In [4] :
df.describe()
  • Out [4] :
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension class
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 0.627417
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 0.483918
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 0.000000
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 0.000000
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 1.000000
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 1.000000
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 1.000000
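
Note how widely the feature scales differ in the summary above: mean area has a standard deviation near 352, while mean fractal dimension's is about 0.007. This gap is why the inputs are standardized in the Preprocessing step below.

# Raw feature scales span several orders of magnitude.
print(df['mean area'].std())               # ~351.91
print(df['mean fractal dimension'].std())  # ~0.00706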

Convert to PyTorch Tensor

  • In [5] :
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
  • In [6] :
data = torch.from_numpy(df.values).float()

data.shape
  • Out [6] :

torch.Size([569, 31])
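
As an aside, torch.from_numpy shares memory with the source NumPy array, and .float() casts the float64 values coming from pandas down to float32, the default dtype expected by PyTorch layers:

# df.values is float64; .float() returns a new float32 copy.
print(data.dtype)  # torch.float32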

  • In [7] :
x = data[:, :-1]
y = data[:, -1:]

print(x.shape, y.shape)

torch.Size([569, 30]) torch.Size([569, 1])

Train / Valid / Test ratio

  • In [8] :
ratios = [.6, .2, .2]
  • In [9] :
train_cnt = int(data.size(0) * ratios[0])
valid_cnt = int(data.size(0) * ratios[1])
test_cnt = data.size(0) - train_cnt - valid_cnt
cnts = [train_cnt, valid_cnt, test_cnt]

print("Train %d / Valid %d / Test %d samples." % (train_cnt, valid_cnt, test_cnt))
Train 341 / Valid 113 / Test 115 samples.
  • In [10] :
indices = torch.randperm(data.size(0))

x = torch.index_select(x, dim=0, index=indices)
y = torch.index_select(y, dim=0, index=indices)

x = x.split(cnts, dim=0)
y = y.split(cnts, dim=0)

for x_i, y_i in zip(x, y):
    print(x_i.size(), y_i.size())
torch.Size([341, 30]) torch.Size([341, 1])
torch.Size([113, 30]) torch.Size([113, 1])
torch.Size([115, 30]) torch.Size([115, 1])
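
For reference, torch.utils.data can perform the same shuffle-and-split in one call; a sketch assuming a TensorDataset wrapper is acceptable (the rest of the chapter keeps the manual tensors above):

# Hypothetical alternative to the randperm / index_select / split sequence.
from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(data[:, :-1], data[:, -1:])
train_set, valid_set, test_set = random_split(dataset, cnts)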

Preprocessing

  • In [11] :
scaler = StandardScaler()
scaler.fit(x[0].numpy())

x = [torch.from_numpy(scaler.transform(x[0].numpy())).float(),
     torch.from_numpy(scaler.transform(x[1].numpy())).float(),
     torch.from_numpy(scaler.transform(x[2].numpy())).float()]

df = pd.DataFrame(x[0].numpy(), columns=cancer.feature_names)
df.tail()
  • Out [11] :
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst radius worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension
336 2.109400 -0.982811 2.074373 2.141978 0.185909 0.834008 1.251482 1.990152 0.556192 -0.173216 ... 1.375889 -0.629586 1.369703 1.215443 -0.684062 0.122503 0.548355 1.215406 -0.293150 0.080699
337 0.301591 -0.605077 0.296273 0.197547 -0.622822 -0.197263 -0.192340 -0.226158 -0.799453 -0.451247 ... 0.330800 -0.888033 0.324745 0.218299 0.660655 0.418800 0.228829 0.124483 0.779932 0.641780
338 -1.547870 -0.808289 -1.499389 -1.213282 1.248526 -0.454432 -0.611624 -0.789981 -0.602723 1.873202 ... -1.398695 -0.988181 -1.291901 -1.070585 1.708544 -0.164742 -0.170464 -0.184153 0.013892 0.866421
339 -1.253992 -0.846540 -1.262700 -1.041903 -0.508002 -1.105605 -0.983191 -0.925247 0.942498 0.008845 ... -1.160364 -0.784654 -1.182241 -0.941166 0.552717 -1.025874 -1.111026 -1.000206 -0.119264 -0.175809
340 -0.265338 0.317742 -0.319287 -0.319205 -1.721811 -0.996952 -0.897690 -0.964754 -0.373802 -0.589556 ... -0.363802 0.439739 -0.355996 -0.405396 -1.016868 -0.598023 -0.943469 -1.147496 -0.360512 -0.565535

5 rows × 30 columns
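
Note that the scaler was fit on the training split only and then applied to all three splits, so no statistics leak from the validation or test data. A quick sanity check that the training split is now roughly zero-mean with unit variance:

# Standardized training features should be ~N(0, 1) column-wise.
print(x[0].mean(dim=0).abs().max())  # close to 0
print(x[0].std(dim=0).mean())        # close to 1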

Build Model & Optimizer

  • In [12] :
# The input vector is 30-dimensional.

model = nn.Sequential(
    nn.Linear(x[0].size(-1), 25), # 30 -> 25 dims
    nn.LeakyReLU(),
    nn.Linear(25, 20), # 25 -> 20 dims
    nn.LeakyReLU(),
    nn.Linear(20, 15), # 20 -> 15 dims
    nn.LeakyReLU(),
    nn.Linear(15, 10), # 15 -> 10 dims
    nn.LeakyReLU(),
    nn.Linear(10, 5), # 10 -> 5 dims
    nn.LeakyReLU(),
    nn.Linear(5, y[0].size(-1)),
    nn.Sigmoid(),
)

model
  • Out [12] :
Sequential(
  (0): Linear(in_features=30, out_features=25, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=25, out_features=20, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=20, out_features=15, bias=True)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=15, out_features=10, bias=True)
  (7): LeakyReLU(negative_slope=0.01)
  (8): Linear(in_features=10, out_features=5, bias=True)
  (9): LeakyReLU(negative_slope=0.01)
  (10): Linear(in_features=5, out_features=1, bias=True)
  (11): Sigmoid()
)
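
One variant worth knowing: the trailing nn.Sigmoid() plus F.binary_cross_entropy can be replaced by nn.BCEWithLogitsLoss, which fuses the sigmoid into the loss for better numerical stability. A minimal sketch, not what this chapter uses:

# Drop the final Sigmoid and feed raw logits to the fused loss instead.
logit_model = nn.Sequential(
    nn.Linear(30, 25),
    nn.LeakyReLU(),
    nn.Linear(25, 1),  # no Sigmoid here
)
crit = nn.BCEWithLogitsLoss()  # applies the sigmoid internally
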
  • In [13] :
optimizer = optim.Adam(model.parameters())
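
Adam is used with its defaults here; its default learning rate is 1e-3, so the call above is equivalent to:

# lr=1e-3 is Adam's default.
optimizer = optim.Adam(model.parameters(), lr=1e-3)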

Train

  • In [14] :
n_epochs = 10000
batch_size = 32
print_interval = 100
early_stop = 1000
  • In [15] :
from copy import deepcopy

lowest_loss = np.inf
best_model = None

lowest_epoch = np.inf
  • In [16] :
train_history, valid_history = [], []

for i in range(n_epochs):
    indices = torch.randperm(x[0].size(0))
    x_ = torch.index_select(x[0], dim=0, index=indices) 
    y_ = torch.index_select(y[0], dim=0, index=indices)
    

    x_ = x_.split(batch_size, dim=0) # |xi| = (32,30)
    y_ = y_.split(batch_size, dim=0) # |yi| = (32,1)
    
    train_loss, valid_loss = 0, 0
    y_hat = []
    
    for x_i, y_i in zip(x_, y_):
        y_hat_i = model(x_i)
        loss = F.binary_cross_entropy(y_hat_i, y_i)
    
        optimizer.zero_grad()
        loss.backward()
    
        optimizer.step()        
        train_loss += float(loss) # float() detaches the value from the graph; accumulating the tensor itself would leak memory.
    
    train_loss = train_loss / len(x_)
        
    with torch.no_grad():
        x_ = x[1].split(batch_size, dim=0)
        y_ = y[1].split(batch_size, dim=0)
        
        valid_loss = 0
        
        for x_i, y_i in zip(x_, y_):
            y_hat_i = model(x_i)
            loss = F.binary_cross_entropy(y_hat_i, y_i)
            
            valid_loss += float(loss)
            
            y_hat += [y_hat_i]
            
    valid_loss = valid_loss / len(x_)
    
    train_history += [train_loss]
    valid_history += [valid_loss]
        
    if (i + 1) % print_interval == 0:
        print('Epoch %d: train loss=%.4e  valid_loss=%.4e  lowest_loss=%.4e' % (
            i + 1,
            train_loss,
            valid_loss,
            lowest_loss,
        ))
        
    if valid_loss <= lowest_loss:
        lowest_loss = valid_loss
        lowest_epoch = i
        
        best_model = deepcopy(model.state_dict())
    else:
        if early_stop > 0 and lowest_epoch + early_stop < i + 1:
            print("There is no improvement during last %d epochs." % early_stop)
            break

print("The best validation loss from epoch %d: %.4e" % (lowest_epoch + 1, lowest_loss))
model.load_state_dict(best_model)
Epoch 100: train loss=5.6986e-03  valid_loss=2.1232e-01  lowest_loss=8.8456e-02
Epoch 200: train loss=1.7982e-05  valid_loss=3.7934e-01  lowest_loss=8.8456e-02
Epoch 300: train loss=4.0965e-06  valid_loss=4.2597e-01  lowest_loss=8.8456e-02
Epoch 400: train loss=1.6596e-06  valid_loss=1.1125e+00  lowest_loss=8.8456e-02
Epoch 500: train loss=7.0210e-07  valid_loss=1.1312e+00  lowest_loss=8.8456e-02
Epoch 600: train loss=3.9337e-07  valid_loss=1.1483e+00  lowest_loss=8.8456e-02
Epoch 700: train loss=1.7483e-07  valid_loss=1.1647e+00  lowest_loss=8.8456e-02
Epoch 800: train loss=9.1729e-08  valid_loss=1.1804e+00  lowest_loss=8.8456e-02
Epoch 900: train loss=4.5905e-08  valid_loss=1.1962e+00  lowest_loss=8.8456e-02
Epoch 1000: train loss=2.3287e-08  valid_loss=1.2109e+00  lowest_loss=8.8456e-02
There is no improvement during last 1000 epochs.
The best validation loss from epoch 12: 8.8456e-02
  • Out [16] :
<All keys matched successfully>
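
Having restored the best weights, we could also persist them to disk for later reuse; a small sketch with a hypothetical filename:

# Save / reload the early-stopped weights (filename is illustrative).
torch.save(best_model, 'best_model.pt')
# model.load_state_dict(torch.load('best_model.pt'))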

Loss History

  • In [17] :
plot_from = 2

plt.figure(figsize=(20, 10))
plt.grid(True)
plt.title("Train / Valid Loss History")
plt.plot(
    range(plot_from, len(train_history)), train_history[plot_from:],
    range(plot_from, len(valid_history)), valid_history[plot_from:],
)
plt.yscale('log')
plt.show()

(Figure: train / valid loss history on a log-scaled y-axis)

Evaluate on Test Set

  • In [18] :
test_loss = 0
y_hat = []

with torch.no_grad():
    x_ = x[2].split(batch_size, dim=0)
    y_ = y[2].split(batch_size, dim=0)

    for x_i, y_i in zip(x_, y_):
        y_hat_i = model(x_i)
        loss = F.binary_cross_entropy(y_hat_i, y_i)
    
        test_loss += loss # No graph is built under torch.no_grad(), so accumulating the tensor is safe.
    
        y_hat += [y_hat_i]

test_loss = test_loss / len(x_)
y_hat = torch.cat(y_hat, dim=0)

print("Test loss: %.4e" % test_loss)
Test loss: 7.3890e-02
  • In [19] :
correct_cnt = (y[2] == (y_hat > .5)).sum()
total_cnt = float(y[2].size(0))

print('Test Accuracy: %.4f' % (correct_cnt / total_cnt))
Test Accuracy: 0.9913
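
Accuracy alone hides which kind of error the model makes; a confusion matrix separates false positives from false negatives. A sketch using sklearn on the test predictions above:

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
pred = (y_hat > .5).long().numpy().ravel()
true = y[2].numpy().ravel()
print(confusion_matrix(true, pred))
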
  • In [20] :
df = pd.DataFrame(torch.cat([y[2], y_hat], dim=1).detach().numpy(),
                  columns=["y", "y_hat"])

sns.histplot(df, x='y_hat', hue='y', bins=50, stat='probability')
plt.show()

(Figure: histogram of y_hat on the test set, colored by true class y)

  • In [21] :
from sklearn.metrics import roc_auc_score

roc_auc_score(df.values[:, 0], df.values[:, 1])
  • Out [21] :

0.9951550387596899
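
The full ROC curve behind this score can be plotted as well; a sketch reusing the same df of true labels and predicted probabilities:

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(df.values[:, 0], df.values[:, 1])
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()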