TL;DR

Following up on the previous post, this article implements the same neural network model using PyTorch.

Model

We will use two fully connected (linear) layers and apply a ReLU activation in the forward pass.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Dimensions carried over from the previous post (2 input features, 100 hidden units, 3 classes)
INPUT_DIM = 2
HIDDEN_DIM = 100
NUM_CLASSES = 3

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x_in):
        z = F.relu(self.fc1(x_in))  # hidden layer + ReLU
        z = self.fc2(z)             # output logits
        return z

# Initialize model
model = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, num_classes=NUM_CLASSES)
print (model.named_parameters)

# Output
# <bound method Module.named_parameters of MLP(
#   (fc1): Linear(in_features=2, out_features=100, bias=True)
#   (fc2): Linear(in_features=100, out_features=3, bias=True)
# )>

Training

The training code is almost identical to what we wrote earlier for logistic regression.

from torch.optim import Adam

LEARNING_RATE = 1e-2
NUM_EPOCHS = 10
BATCH_SIZE = 32

# Define loss (class_weights comes from the data preparation in the previous post)
class_weights_tensor = torch.Tensor(list(class_weights.values()))
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)


# Accuracy
def accuracy_fn(y_pred, y_true):
    n_correct = torch.eq(y_pred, y_true).sum().item()
    accuracy = (n_correct / len(y_pred)) * 100
    return accuracy

# Optimizer
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

# Convert data to tensors (the splits come from the previous post)
X_train = torch.Tensor(X_train)
y_train = torch.LongTensor(y_train)
X_val = torch.Tensor(X_val)
y_val = torch.LongTensor(y_val)
X_test = torch.Tensor(X_test)
y_test = torch.LongTensor(y_test)

# Training (100 epochs in total)
for epoch in range(NUM_EPOCHS * 10):
    # Forward pass
    y_pred = model(X_train)

    # Loss
    loss = loss_fn(y_pred, y_train)

    # Zero all gradients
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update weights
    optimizer.step()

    if epoch % 10 == 0:
        predictions = y_pred.max(dim=1)[1]  # class
        accuracy = accuracy_fn(y_pred=predictions, y_true=y_train)
        print (f"Epoch: {epoch} | loss: {loss:.2f}, accuracy: {accuracy:.1f}")

# Output
# Epoch: 0 | loss: 0.09, accuracy: 98.6
# Epoch: 10 | loss: 0.06, accuracy: 99.0
# Epoch: 20 | loss: 0.05, accuracy: 99.2
# Epoch: 30 | loss: 0.04, accuracy: 99.6
# Epoch: 40 | loss: 0.03, accuracy: 99.7
# Epoch: 50 | loss: 0.03, accuracy: 99.7
# Epoch: 60 | loss: 0.03, accuracy: 99.7
# Epoch: 70 | loss: 0.02, accuracy: 99.7
# Epoch: 80 | loss: 0.02, accuracy: 99.7
# Epoch: 90 | loss: 0.02, accuracy: 99.7

Evaluation

import json

# Predictions
y_prob = F.softmax(model(X_test), dim=1)
y_pred = y_prob.max(dim=1)[1]

# Performance (get_metrics is the helper defined in the previous post)
performance = get_metrics(y_true=y_test, y_pred=y_pred, classes=classes)
print (json.dumps(performance, indent=2))

# Output
# {
#   "overall": {
#     "precision": 1.0,
#     "recall": 1.0,
#     "f1": 1.0,
#     "num_samples": 225.0
#   },
#   "class": {
#     "c1": {
#       "precision": 1.0,
#       "recall": 1.0,
#       "f1": 1.0,
#       "num_samples": 75.0
#     },
#     "c2": {
#       "precision": 1.0,
#       "recall": 1.0,
#       "f1": 1.0,
#       "num_samples": 75.0
#     },
#     "c3": {
#       "precision": 1.0,
#       "recall": 1.0,
#       "f1": 1.0,
#       "num_samples": 75.0
#     }
#   }
# }

# Visualize the decision boundary (plot_multiclass_decision_boundary is from the previous post)
import matplotlib.pyplot as plt
plt.figure(figsize=(12,5))
plt.subplot(1, 2, 1)
plt.title("Train")
plot_multiclass_decision_boundary(model=model, X=X_train, y=y_train)
plt.subplot(1, 2, 2)
plt.title("Test")
plot_multiclass_decision_boundary(model=model, X=X_test, y=y_test)
plt.show()

As you can see, PyTorch's intuitive and easy-to-use API keeps the learning curve relatively gentle.

The core code we need to write is concentrated in four parts: defining the model, defining the loss function and optimizer, writing the training loop, and running validation and testing.

Of course, there are many more details to consider, such as data preprocessing, saving and loading models, and using a GPU.
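
For example, saving/loading the weights and moving the model to a GPU only take a few lines. A minimal sketch (the file name model.pt is just an illustration, not something from the original post):

# Save / load the trained weights (the file name is arbitrary)
torch.save(model.state_dict(), "model.pt")
model.load_state_dict(torch.load("model.pt"))

# Move the model (and its inputs) to a GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
X_train = X_train.to(device)
y_train = y_train.to(device)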

Initializing weights

So far we have initialized the weights with small random values, which is not actually the best way to help the model converge during training.

The goal is to initialize the weights so that the activation outputs neither vanish nor explode, since either case prevents the model from converging. In fact, we can customize the weight initialization ourselves; the most commonly used schemes today are Xavier initialization and He initialization.

In fact, PyTorch's Linear class already uses kaiming_uniform_ initialization by default (see its source code). Later we will learn more advanced strategies for improving convergence, such as batch normalization.

from torch.nn import init

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def init_weights(self):
        init.xavier_normal_(self.fc1.weight, gain=init.calculate_gain("relu"))

    def forward(self, x_in):
        z = F.relu(self.fc1(x_in))
        z = self.fc2(z)
        return z
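
Note that init_weights is not called automatically by nn.Module; a minimal sketch of invoking it after instantiation (reusing the same INPUT_DIM, HIDDEN_DIM, and NUM_CLASSES as above) might look like this:

model = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, num_classes=NUM_CLASSES)
model.init_weights()  # re-initialize fc1 with Xavier-normal weights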

Dropout

The best way to improve a model's performance is to add more data, but that is not always an option. Fortunately, there are other techniques that make the model more robust, such as regularization and dropout.
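
As a concrete example of regularization, L2 weight decay can be added directly through the optimizer. This is only a sketch; the value 1e-4 is an illustrative choice, not from the original post:

# Adam with L2 weight decay (regularization strength is a hyperparameter to tune)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)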

Dropout is a technique that zeroes out the outputs of some neurons during training. Because a different set of neurons is dropped in each batch, dropout acts as a kind of sampling strategy that helps prevent overfitting.

DROPOUT_P = 0.1  # percentage of weights that are dropped each pass

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout_p, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_p)  # dropout
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def init_weights(self):
        init.xavier_normal_(self.fc1.weight, gain=init.calculate_gain("relu"))

    def forward(self, x_in):
        z = F.relu(self.fc1(x_in))
        z = self.dropout(z)  # dropout
        z = self.fc2(z)
        return z

# Initialize model
model = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM,
            dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
print (model.named_parameters)

# Output
# <bound method Module.named_parameters of MLP(
#   (fc1): Linear(in_features=2, out_features=100, bias=True)
#   (dropout): Dropout(p=0.1, inplace=False)
#   (fc2): Linear(in_features=100, out_features=3, bias=True)
# )>
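
One detail worth remembering: nn.Dropout is only active while the model is in training mode. A small sketch of toggling the two modes:

model.train()  # enable dropout (and other training-only behavior)
# ... run the training loop here ...
model.eval()   # disable dropout for validation / inference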

Overfitting

Although neural networks are good at capturing non-linear relationships, they can easily overfit the training data and fail to generalize to the test data.

In the example below, we generate completely random data and try to fit a model with $2 \times N \times C + D$ hidden neurons (where $N$ is the number of samples per class, $C$ the number of classes, and $D$ the input dimension).

import numpy as np
from sklearn.preprocessing import StandardScaler

NUM_EPOCHS = 500
NUM_SAMPLES_PER_CLASS = 50
LEARNING_RATE = 1e-1
HIDDEN_DIM = 2 * NUM_SAMPLES_PER_CLASS * NUM_CLASSES + INPUT_DIM  # 2*N*C + D

# Generate random data
X = np.random.rand(NUM_SAMPLES_PER_CLASS * NUM_CLASSES, INPUT_DIM)
y = np.array([[i] * NUM_SAMPLES_PER_CLASS for i in range(NUM_CLASSES)]).reshape(-1)
print ("X: ", format(np.shape(X)))
print ("y: ", format(np.shape(y)))

# Output
# X: (150, 2)
# y: (150,)

# Create data splits (train_val_test_split and TRAIN_SIZE come from the previous post)
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X=X, y=y, train_size=TRAIN_SIZE)
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")

# Output
# X_train: (105, 2), y_train: (105,)
# X_val: (23, 2), y_val: (23,)
# X_test: (22, 2), y_test: (22,)
# Sample point: [0.51102894 0.55377194] → 2


# Standardize the inputs (mean=0, std=1) using training data
X_scaler = StandardScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

# Convert data to tensors
X_train = torch.Tensor(X_train)
y_train = torch.LongTensor(y_train)
X_val = torch.Tensor(X_val)
y_val = torch.LongTensor(y_val)
X_test = torch.Tensor(X_test)
y_test = torch.LongTensor(y_test)

# Initialize model
model = MLP(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM,
dropout_p=DROPOUT_P, num_classes=NUM_CLASSES)
print (model.named_parameters)


# Output
# <bound method Module.named_parameters of MLP(
#   (fc1): Linear(in_features=2, out_features=302, bias=True)
#   (dropout): Dropout(p=0.1, inplace=False)
#   (fc2): Linear(in_features=302, out_features=3, bias=True)
# )>

# Optimizer
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

# Training
for epoch in range(NUM_EPOCHS):
    # Forward pass
    y_pred = model(X_train)

    # Loss
    loss = loss_fn(y_pred, y_train)

    # Zero all gradients
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Update weights
    optimizer.step()

    if epoch % 20 == 0:
        predictions = y_pred.max(dim=1)[1]  # class
        accuracy = accuracy_fn(y_pred=predictions, y_true=y_train)
        print (f"Epoch: {epoch} | loss: {loss:.2f}, accuracy: {accuracy:.1f}")
# Output
# Epoch: 0 | loss: 1.07, accuracy: 43.8
# Epoch: 20 | loss: 0.94, accuracy: 52.4
# Epoch: 40 | loss: 0.89, accuracy: 55.2
# Epoch: 60 | loss: 0.87, accuracy: 49.5
# Epoch: 80 | loss: 0.82, accuracy: 63.8
# Epoch: 100 | loss: 0.84, accuracy: 62.9
# Epoch: 120 | loss: 0.75, accuracy: 63.8
# Epoch: 140 | loss: 0.77, accuracy: 60.0
# Epoch: 160 | loss: 0.75, accuracy: 60.0
# Epoch: 180 | loss: 0.75, accuracy: 66.7
# Epoch: 200 | loss: 0.75, accuracy: 67.6
# Epoch: 220 | loss: 0.69, accuracy: 68.6
# Epoch: 240 | loss: 0.75, accuracy: 65.7
# Epoch: 260 | loss: 0.73, accuracy: 71.4
# Epoch: 280 | loss: 0.73, accuracy: 69.5
# Epoch: 300 | loss: 0.71, accuracy: 62.9
# Epoch: 320 | loss: 0.68, accuracy: 69.5
# Epoch: 340 | loss: 0.74, accuracy: 65.7
# Epoch: 360 | loss: 0.68, accuracy: 71.4
# Epoch: 380 | loss: 0.78, accuracy: 63.8
# Epoch: 400 | loss: 0.69, accuracy: 66.7
# Epoch: 420 | loss: 0.75, accuracy: 67.6
# Epoch: 440 | loss: 0.76, accuracy: 69.5
# Epoch: 460 | loss: 0.71, accuracy: 67.6
# Epoch: 480 | loss: 0.66, accuracy: 66.7


# Predictions
y_prob = F.softmax(model(X_test), dim=1)
y_pred = y_prob.max(dim=1)[1]

# Performance
performance = get_metrics(y_true=y_test, y_pred=y_pred, classes=classes)
print (json.dumps(performance, indent=2))

# Output
# {
#   "overall": {
#     "precision": 0.45959595959595956,
#     "recall": 0.45454545454545453,
#     "f1": 0.4512987012987013,
#     "num_samples": 22.0
#   },
#   "class": {
#     "c1": {
#       "precision": 0.5,
#       "recall": 0.375,
#       "f1": 0.42857142857142855,
#       "num_samples": 8.0
#     },
#     "c2": {
#       "precision": 0.4444444444444444,
#       "recall": 0.5714285714285714,
#       "f1": 0.5,
#       "num_samples": 7.0
#     },
#     "c3": {
#       "precision": 0.42857142857142855,
#       "recall": 0.42857142857142855,
#       "f1": 0.42857142857142855,
#       "num_samples": 7.0
#     }
#   }
# }

# Visualize the decision boundary
plt.figure(figsize=(12,5))
plt.subplot(1, 2, 1)
plt.title("Train")
plot_multiclass_decision_boundary(model=model, X=X_train, y=y_train)
plt.subplot(1, 2, 2)
plt.title("Test")
plot_multiclass_decision_boundary(model=model, X=X_test, y=y_test)
plt.show()

As you can see, even though the model reaches close to 70% accuracy on the training set, its performance on the test set is far from satisfactory.

The key is to experiment: start with a simple model that underfits (high bias), then work toward a good fit while avoiding overfitting.
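
One way to catch overfitting early is to also evaluate on the validation split during training. A minimal sketch using the tensors and helpers defined above (when and how often to run it is up to you):

model.eval()                       # disable dropout for evaluation
with torch.no_grad():              # no gradients needed for validation
    y_val_pred = model(X_val)
    val_loss = loss_fn(y_val_pred, y_val)
    val_acc = accuracy_fn(y_pred=y_val_pred.max(dim=1)[1], y_true=y_val)
print (f"val loss: {val_loss:.2f}, val accuracy: {val_acc:.1f}")
model.train()                      # switch back before continuing to train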

Citation

@article{madewithml,
    author       = {Goku Mohandas},
    title        = {Neural networks - Made With ML},
    howpublished = {\url{https://madewithml.com/}},
    year         = {2022}
}