TL;DR

The previous post gave a brief introduction to embeddings and GloVe. This post walks through a CNN that adds an Embedding layer.

Note that all of the preparation work is essentially the same as in 《Way2AI · 卷积神经网络》; the main difference is that the model now includes an Embedding layer.

Set up

# pip install numpy==1.21.2

import numpy as np

import pandas as pd
import random
import torch
import torch.nn as nn


def set_seeds(seed=1024):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # multi-GPU

set_seeds(seed=1024)

cuda = True
device = torch.device("cuda" if(torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type({"cuda": "torch.cuda.FloatTensor", "cpu": "torch.FloatTensor"}.get(str(device)))

Load data

url = "https://s3.mindex.xyz/datasets/news.csv"
df = pd.read_csv(url, header=0)
df = df.sample(frac=1).reset_index(drop=True)
df["title"][:100]

# Output
# 0 Israel announces West Bank housing plan; barri...
# 1 Red Sox #39;s Feat: As far back as I can remember
# 2 J.P. Morgan Cancels IBM Outsourcing Deal (Reut...
# 3 Intel Names Otellini New CEO
# 4 Branson Launches Virgin Atlantic Flights to Au...
# ...
# 95 Yahoo Profit Surges on Sales of Ads, Google Stock
# 96 DirecT Touchdown
# 97 Struggling Bucs Best Dismal Bears, 19-7 (AP)
# 98 Romania PM, Bucharest Mayor Battle for Preside...
# 99 Glazer Quest for United Falters
# Name: title, Length: 100, dtype: object

Processing

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download("stopwords")
STOPWORDS = stopwords.words("english")
print (STOPWORDS[:5])
porter = PorterStemmer()

def preprocess(text, stopwords=STOPWORDS):
    """Conditional preprocessing on our text unique to our task."""
    # Lower
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text) # remove non alphanumeric chars
    text = re.sub(" +", " ", text) # remove multiple spaces
    text = text.strip()

    return text


# Apply to dataframe
preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)

Split data

import collections
from sklearn.model_selection import train_test_split

TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15


def train_val_test_split(X, y, train_size):
    """Split dataset into data splits."""
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test


# Data
X = preprocessed_df["title"].values
y = preprocessed_df["category"].values


# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X=X, y=y, train_size=TRAIN_SIZE)
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")


# Output
# X_train: (84000,), y_train: (84000,)
# X_val: (18000,), y_val: (18000,)
# X_test: (18000,), y_test: (18000,)
# Sample point: ibm wins time talks pension case → Sci/Tech

Label encoding

import itertools
import json


class LabelEncoder(object):
    """Label encoder for tag labels."""
    def __init__(self, class_to_index={}):
        self.class_to_index = class_to_index or {} # mutable defaults ;)
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        classes = np.unique(y)
        for i, class_ in enumerate(classes):
            self.class_to_index[class_] = i
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        encoded = np.zeros((len(y)), dtype=int)
        for i, item in enumerate(y):
            encoded[i] = self.class_to_index[item]
        return encoded

    def decode(self, y):
        classes = []
        for i, item in enumerate(y):
            classes.append(self.index_to_class[item])
        return classes

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {'class_to_index': self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)


# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
print(label_encoder.class_to_index)

# Output
# {'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

# Convert labels to tokens
print (f"y_train[0]: {y_train[0]}")
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)
print (f"y_train[0]: {y_train[0]}")

# Output
# y_train[0]: Sci/Tech
# y_train[0]: 1


# Class weights
counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"counts: {counts}\nweights: {class_weights}")

# Output
# counts: [21000 21000 21000 21000]
# weights: {0: 4.761904761904762e-05, 1: 4.761904761904762e-05, 2: 4.761904761904762e-05, 3: 4.761904761904762e-05}

Tokenizer

import json
from collections import Counter
from more_itertools import take


class Tokenizer(object):
    def __init__(self, char_level, num_tokens=None,
                 pad_token="<PAD>", oov_token="<UNK>",
                 token_to_index=None):
        self.char_level = char_level
        self.separator = "" if self.char_level else " "
        if num_tokens: num_tokens -= 2 # pad + unk tokens
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        self.token_to_index = token_to_index
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        self.min_token_freq = counts[-1][1]
        for token, count in counts:
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(" ")
            sequence = []
            for token in text:
                sequence.append(self.token_to_index.get(
                    token, self.token_to_index[self.oov_token]))
            sequences.append(np.asarray(sequence))
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            text = []
            for index in sequence:
                text.append(self.index_to_token.get(index, self.oov_token))
            texts.append(self.separator.join([token for token in text]))
        return texts

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {
                "char_level": self.char_level,
                "oov_token": self.oov_token,
                "token_to_index": self.token_to_index
            }
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)


# Tokenize
tokenizer = Tokenizer(char_level=False, num_tokens=5000)
tokenizer.fit_on_texts(texts=X_train)
VOCAB_SIZE = len(tokenizer)
print (tokenizer)

# Output
# <Tokenizer(num_tokens=5000)>


# Sample of tokens
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") # use this to adjust num_tokens

# Output
# [('<PAD>', 0), ('<UNK>', 1), ('39', 2), ('b', 3), ('gt', 4)]
# least freq token's freq: 14


# Convert texts to sequences of indices
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
preprocessed_text = tokenizer.sequences_to_texts([X_train[0]])[0]
print ("Text to indices:\n"
f" (preprocessed) → {preprocessed_text}\n"
f" (tokenized) → {X_train[0]}")

# Output
# Text to indices:
# (preprocessed) → ibm wins time talks pension case
# (tokenized) → [ 31 32 69 26 715 100]

Padding

def pad_sequences(sequences, max_seq_len=0):
    """Pad sequences to max length in sequence."""
    max_seq_len = max(max_seq_len, max(len(sequence) for sequence in sequences))
    padded_sequences = np.zeros((len(sequences), max_seq_len))
    for i, sequence in enumerate(sequences):
        padded_sequences[i][:len(sequence)] = sequence
    return padded_sequences


# 2D sequences
padded = pad_sequences(X_train[0:3])
print (padded.shape)
print (padded)

# Output
# (3, 8)
# [[3.100e+01 3.200e+01 6.900e+01 2.600e+01 7.150e+02 1.000e+02 0.000e+00
# 0.000e+00]
# [3.568e+03 9.000e+00 4.520e+03 2.000e+00 1.000e+00 2.396e+03 7.760e+02
# 1.500e+01]
# [1.000e+01 1.094e+03 7.600e+01 5.960e+02 5.740e+02 8.000e+02 0.000e+00
# 0.000e+00]]

Dataset

FILTER_SIZES = list(range(2, 5)) # bi, tri and 4 grams


class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y, max_filter_size):
        self.X = X
        self.y = y
        self.max_filter_size = max_filter_size

    def __len__(self):
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return [X, y]

    def collate_fn(self, batch):
        """Processing on a batch."""
        # Get inputs
        batch = np.array(batch)
        X = batch[:, 0]
        y = batch[:, 1]

        # Pad sequences (at least as long as the largest filter)
        X = pad_sequences(X, max_seq_len=self.max_filter_size)

        # Cast
        X = torch.LongTensor(X.astype(np.int32))
        y = torch.LongTensor(y.astype(np.int32))

        return X, y

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        return torch.utils.data.DataLoader(
            dataset=self, batch_size=batch_size, collate_fn=self.collate_fn,
            shuffle=shuffle, drop_last=drop_last, pin_memory=True)

# Create datasets
max_filter_size = max(FILTER_SIZES)
train_dataset = Dataset(X=X_train, y=y_train, max_filter_size=max_filter_size)
val_dataset = Dataset(X=X_val, y=y_val, max_filter_size=max_filter_size)
test_dataset = Dataset(X=X_test, y=y_test, max_filter_size=max_filter_size)
print ("Datasets:\n"
f" Train dataset:{train_dataset.__str__()}\n"
f" Val dataset: {val_dataset.__str__()}\n"
f" Test dataset: {test_dataset.__str__()}\n"
"Sample point:\n"
f" X: {train_dataset[0][0]}\n"
f" y: {train_dataset[0][1]}")


# Output
# Datasets:
# Train dataset:<Dataset(N=84000)>
# Val dataset: <Dataset(N=18000)>
# Test dataset: <Dataset(N=18000)>
# Sample point:
# X: [ 31 32 69 26 715 100]
# y: 1


# Create dataloaders
batch_size = 64
train_dataloader = train_dataset.create_dataloader(batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(batch_size=batch_size)
batch_X, batch_y = next(iter(train_dataloader))
print ("Sample batch:\n"
f" X: {list(batch_X.size())}\n"
f" y: {list(batch_y.size())}\n"
"Sample point:\n"
f" X: {batch_X[0]}\n"
f" y: {batch_y[0]}")

# Output
# Sample batch:
# X: [64, 10]
# y: [64]
# Sample point:
# X: tensor([ 31, 32, 69, 26, 715, 100, 0, 0, 0, 0])
# y: 1

Model

Let's walk through the model's forward pass (a short shape trace follows the list).

  • First, the input is tokenized: (batch_size, max_seq_len)
  • Then the tokenized input is embedded: (batch_size, max_seq_len, embedding_dim)
  • Next, we convolve the embedded input with filters of shape (filter_size, embedding_dim, num_filters). We use three filter sizes (2, 3 and 4) that act as bi-gram, tri-gram and 4-gram feature extractors.
  • 1D max pooling is then applied to extract the most relevant information from the feature maps for the decision.
  • This is followed by a fully-connected layer with dropout.
  • Finally, another fully-connected layer with softmax outputs the class probabilities.
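Before the full module, here is a minimal shape trace of a single filter size, using hypothetical sizes (batch_size=64, max_seq_len=10, vocab_size=5000, embedding_dim=100, num_filters=50); the actual model below additionally applies "SAME" padding and concatenates the outputs of all three filter sizes.

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randint(0, 5000, (64, 10))        # tokenized input: (batch_size, max_seq_len)
embed = nn.Embedding(num_embeddings=5000, embedding_dim=100, padding_idx=0)
z = embed(x)                                # (64, 10, 100)
z = z.transpose(1, 2)                       # (64, 100, 10), Conv1d expects (N, C, L)
conv = nn.Conv1d(in_channels=100, out_channels=50, kernel_size=3)  # tri-gram extractor
z = conv(z)                                 # (64, 50, 8) without padding
z = F.max_pool1d(z, z.size(2)).squeeze(2)   # (64, 50): most salient activation per filter
print (z.shape)                             # torch.Size([64, 50])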

import math
import torch.nn.functional as F

EMBEDDING_DIM = 100
HIDDEN_DIM = 100
DROPOUT_P = 0.1


class CNN(nn.Module):
    def __init__(self, embedding_dim, vocab_size, num_filters,
                 filter_sizes, hidden_dim, dropout_p, num_classes,
                 pretrained_embeddings=None, freeze_embeddings=False,
                 padding_idx=0):
        super(CNN, self).__init__()

        # Filter sizes
        self.filter_sizes = filter_sizes

        # Initialize embeddings
        if pretrained_embeddings is None:
            self.embeddings = nn.Embedding(
                embedding_dim=embedding_dim, num_embeddings=vocab_size,
                padding_idx=padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.embeddings = nn.Embedding(
                embedding_dim=embedding_dim, num_embeddings=vocab_size,
                padding_idx=padding_idx, _weight=pretrained_embeddings)

        # Freeze embeddings or not
        if freeze_embeddings:
            self.embeddings.weight.requires_grad = False

        # Conv weights
        self.conv = nn.ModuleList(
            [nn.Conv1d(in_channels=embedding_dim,
                       out_channels=num_filters,
                       kernel_size=f) for f in filter_sizes])

        # FC weights
        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(num_filters*len(filter_sizes), hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs, channel_first=False):

        # Embed
        x_in, = inputs
        x_in = self.embeddings(x_in)

        # Rearrange input so num_channels is in dim 1 (N, C, L)
        if not channel_first:
            x_in = x_in.transpose(1, 2)

        # Conv outputs
        z = []
        max_seq_len = x_in.shape[2]
        for i, f in enumerate(self.filter_sizes):
            # `SAME` padding
            padding_left = int((self.conv[i].stride[0]*(max_seq_len-1) - max_seq_len + self.filter_sizes[i])/2)
            padding_right = int(math.ceil((self.conv[i].stride[0]*(max_seq_len-1) - max_seq_len + self.filter_sizes[i])/2))

            # Conv + pool
            _z = self.conv[i](F.pad(x_in, (padding_left, padding_right)))
            _z = F.max_pool1d(_z, _z.size(2)).squeeze(2)
            z.append(_z)

        # Concat conv outputs
        z = torch.cat(z, 1)

        # FC layers
        z = self.fc1(z)
        z = self.dropout(z)
        z = self.fc2(z)
        return z

Using GloVe

First, let's implement a couple of helper functions for loading the pretrained GloVe vectors into our model.

def load_glove_embeddings(embeddings_file):
    """Load embeddings from a file."""
    embeddings = {}
    with open(embeddings_file, "r") as fp:
        for index, line in enumerate(fp):
            values = line.split()
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings[word] = embedding
    return embeddings


def make_embeddings_matrix(embeddings, word_index, embedding_dim):
    """Create embeddings matrix to use in Embedding layer."""
    embedding_matrix = np.zeros((len(word_index), embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


# Create embeddings
embeddings_file = 'glove.6B.{0}d.txt'.format(EMBEDDING_DIM)
glove_embeddings = load_glove_embeddings(embeddings_file=embeddings_file)
embedding_matrix = make_embeddings_matrix(
    embeddings=glove_embeddings, word_index=tokenizer.token_to_index,
    embedding_dim=EMBEDDING_DIM)
print (f"<Embeddings(words={embedding_matrix.shape[0]}, dim={embedding_matrix.shape[1]})>")


# Output
# <Embeddings(words=5000, dim=100)>

Experiments

Next, we will run three experiments:

  • Randomly initialized embeddings (fine-tuned)
  • GloVe embeddings (frozen)
  • GloVe embeddings (fine-tuned)

Let's first define our Trainer.

import json
from sklearn.metrics import precision_recall_fscore_support
from torch.optim import Adam

NUM_FILTERS = 50
LEARNING_RATE = 1e-3
PATIENCE = 5
NUM_EPOCHS = 10

class Trainer(object):
    def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):

        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Train step."""
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):

            # Step
            batch = [item.to(self.device) for item in batch] # Set device
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad() # Reset gradients
            z = self.model(inputs) # Forward pass
            J = self.loss_fn(z, targets) # Define loss
            J.backward() # Backward pass
            self.optimizer.step() # Update weights

            # Cumulative Metrics
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step."""
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Step
                batch = [item.to(self.device) for item in batch] # Set device
                inputs, y_true = batch[:-1], batch[-1]
                z = self.model(inputs) # Forward pass
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step."""
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs
                inputs, targets = batch[:-1], batch[-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        best_val_loss = np.inf
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience # reset _patience
            else:
                _patience -= 1
            if not _patience: # 0
                print("Stopping early!")
                break

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
        return best_model


def get_metrics(y_true, y_pred, classes):
    """Per-class performance metrics."""
    # Performance
    performance = {"overall": {}, "class": {}}

    # Overall performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    performance["overall"]["precision"] = metrics[0]
    performance["overall"]["recall"] = metrics[1]
    performance["overall"]["f1"] = metrics[2]
    performance["overall"]["num_samples"] = np.float64(len(y_true))

    # Per-class performance
    metrics = precision_recall_fscore_support(y_true, y_pred, average=None)
    for i in range(len(classes)):
        performance["class"][classes[i]] = {
            "precision": metrics[0][i],
            "recall": metrics[1][i],
            "f1": metrics[2][i],
            "num_samples": np.float64(metrics[3][i]),
        }

    return performance

Random initialization

PRETRAINED_EMBEDDINGS = None
FREEZE_EMBEDDINGS = False

# Initialize model
model = CNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
    num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES,
    hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, num_classes=NUM_CLASSES,
    pretrained_embeddings=PRETRAINED_EMBEDDINGS, freeze_embeddings=FREEZE_EMBEDDINGS)
model = model.to(device) # set device
print (model.named_parameters)

# Output
# <bound method Module.named_parameters of CNN(
# (embeddings): Embedding(5000, 100, padding_idx=0)
# (conv): ModuleList(
# (0): Conv1d(100, 50, kernel_size=(2,), stride=(1,))
# (1): Conv1d(100, 50, kernel_size=(3,), stride=(1,))
# (2): Conv1d(100, 50, kernel_size=(4,), stride=(1,))
# )
# (dropout): Dropout(p=0.1, inplace=False)
# (fc1): Linear(in_features=150, out_features=100, bias=True)
# (fc2): Linear(in_features=100, out_features=4, bias=True)
# )>

# Define Loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

# Trainer module
trainer = Trainer(model=model, device=device, loss_fn=loss_fn,
                  optimizer=optimizer, scheduler=scheduler)

# Train
best_model = trainer.train(NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)


# Output
# Epoch: 1 | train_loss: 0.78800, val_loss: 0.64168, lr: 1.00E-03, _patience: 5
# Epoch: 2 | train_loss: 0.49324, val_loss: 0.60757, lr: 1.00E-03, _patience: 5
# Epoch: 3 | train_loss: 0.38917, val_loss: 0.63572, lr: 1.00E-03, _patience: 4
# Epoch: 4 | train_loss: 0.31891, val_loss: 0.70638, lr: 1.00E-03, _patience: 3
# Epoch: 5 | train_loss: 0.26606, val_loss: 0.76403, lr: 1.00E-03, _patience: 2
# Epoch: 6 | train_loss: 0.22631, val_loss: 0.79747, lr: 1.00E-04, _patience: 1
# Stopping early!


# Get predictions
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)

# Determine performance
performance = get_metrics(
    y_true=y_test, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))

# Output
# {
# "precision": 0.8065551302331581,
# "recall": 0.8066666666666666,
# "f1": 0.8062901077799052,
# "num_samples": 18000.0
# }

GloVe (frozen)

PRETRAINED_EMBEDDINGS = embedding_matrix
FREEZE_EMBEDDINGS = True

# Initialize model
model = CNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
    num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES,
    hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, num_classes=NUM_CLASSES,
    pretrained_embeddings=PRETRAINED_EMBEDDINGS, freeze_embeddings=FREEZE_EMBEDDINGS)
model = model.to(device) # set device
print (model.named_parameters)

# Output
# <bound method Module.named_parameters of CNN(
# (embeddings): Embedding(5000, 100, padding_idx=0)
# (conv): ModuleList(
# (0): Conv1d(100, 50, kernel_size=(2,), stride=(1,))
# (1): Conv1d(100, 50, kernel_size=(3,), stride=(1,))
# (2): Conv1d(100, 50, kernel_size=(4,), stride=(1,))
# )
# (dropout): Dropout(p=0.1, inplace=False)
# (fc1): Linear(in_features=150, out_features=100, bias=True)
# (fc2): Linear(in_features=100, out_features=4, bias=True)
# )>

# Define Loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

# Trainer module
trainer = Trainer(model=model, device=device, loss_fn=loss_fn,
                  optimizer=optimizer, scheduler=scheduler)

# Train
best_model = trainer.train(NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)

# Output
# Epoch: 1 | train_loss: 0.51462, val_loss: 0.49800, lr: 1.00E-03, _patience: 5
# Epoch: 2 | train_loss: 0.43604, val_loss: 0.49792, lr: 1.00E-03, _patience: 5
# Epoch: 3 | train_loss: 0.39698, val_loss: 0.50526, lr: 1.00E-03, _patience: 4
# Epoch: 4 | train_loss: 0.36507, val_loss: 0.51659, lr: 1.00E-03, _patience: 3
# Epoch: 5 | train_loss: 0.33745, val_loss: 0.53612, lr: 1.00E-03, _patience: 2
# Epoch: 6 | train_loss: 0.31418, val_loss: 0.56722, lr: 1.00E-04, _patience: 1
# Stopping early!


# Get predictions
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)

# Determine performance
performance = get_metrics(
    y_true=y_test, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))


# Output
# {
# "precision": 0.8264024010717701,
# "recall": 0.8269444444444445,
# "f1": 0.8263287754212785,
# "num_samples": 18000.0
# }

GloVe (fine-tuned)

PRETRAINED_EMBEDDINGS = embedding_matrix
FREEZE_EMBEDDINGS = False

# Initialize model
model = CNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
    num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES,
    hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, num_classes=NUM_CLASSES,
    pretrained_embeddings=PRETRAINED_EMBEDDINGS, freeze_embeddings=FREEZE_EMBEDDINGS)
model = model.to(device) # set device
print (model.named_parameters)

# Output
# <bound method Module.named_parameters of CNN(
# (embeddings): Embedding(5000, 100, padding_idx=0)
# (conv): ModuleList(
# (0): Conv1d(100, 50, kernel_size=(2,), stride=(1,))
# (1): Conv1d(100, 50, kernel_size=(3,), stride=(1,))
# (2): Conv1d(100, 50, kernel_size=(4,), stride=(1,))
# )
# (dropout): Dropout(p=0.1, inplace=False)
# (fc1): Linear(in_features=150, out_features=100, bias=True)
# (fc2): Linear(in_features=100, out_features=4, bias=True)
# )>


# Define Loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn,
    optimizer=optimizer, scheduler=scheduler)

# Train
best_model = trainer.train(
    NUM_EPOCHS, PATIENCE, train_dataloader, val_dataloader)

# Output
# Epoch: 1 | train_loss: 0.48751, val_loss: 0.45729, lr: 1.00E-03, _patience: 5
# Epoch: 2 | train_loss: 0.38391, val_loss: 0.45669, lr: 1.00E-03, _patience: 5
# Epoch: 3 | train_loss: 0.33045, val_loss: 0.47826, lr: 1.00E-03, _patience: 4
# Epoch: 4 | train_loss: 0.27825, val_loss: 0.52608, lr: 1.00E-03, _patience: 3
# Epoch: 5 | train_loss: 0.22646, val_loss: 0.60470, lr: 1.00E-03, _patience: 2
# Epoch: 6 | train_loss: 0.18130, val_loss: 0.70291, lr: 1.00E-04, _patience: 1
# Stopping early!


# Get predictions
test_loss, y_true, y_prob = trainer.eval_step(dataloader=test_dataloader)
y_pred = np.argmax(y_prob, axis=1)

# Determine performance
performance = get_metrics(
    y_true=y_test, y_pred=y_pred, classes=label_encoder.classes)
print (json.dumps(performance["overall"], indent=2))

# Output
# {
# "precision": 0.8246875013006352,
# "recall": 0.8251666666666667,
# "f1": 0.8248028697657125,
# "num_samples": 18000.0
# }

OK, let's save the necessary artifacts so the model can be fully loaded and used later.

# Save artifacts
from pathlib import Path
dir = Path("cnn")
dir.mkdir(parents=True, exist_ok=True)
label_encoder.save(fp=Path(dir, "label_encoder.json"))
tokenizer.save(fp=Path(dir, "tokenizer.json"))
torch.save(best_model.state_dict(), Path(dir, "model.pt"))
with open(Path(dir, "performance.json"), "w") as fp:
json.dump(performance, indent=2, sort_keys=False, fp=fp)

Inference

Next, let's see how to run inference with the model.


def get_probability_distribution(y_prob, classes):
    """Create a dict of class probabilities from an array."""
    results = {}
    for i, class_ in enumerate(classes):
        results[class_] = np.float64(y_prob[i])
    sorted_results = {k: v for k, v in sorted(
        results.items(), key=lambda item: item[1], reverse=True)}
    return sorted_results

# Load artifacts
device = torch.device("cpu")
label_encoder = LabelEncoder.load(fp=Path(dir, "label_encoder.json"))
tokenizer = Tokenizer.load(fp=Path(dir, "tokenizer.json"))
model = CNN(
    embedding_dim=EMBEDDING_DIM, vocab_size=VOCAB_SIZE,
    num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES,
    hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, num_classes=NUM_CLASSES,
    pretrained_embeddings=PRETRAINED_EMBEDDINGS, freeze_embeddings=FREEZE_EMBEDDINGS)
model.load_state_dict(torch.load(Path(dir, "model.pt"), map_location=device))
model.to(device)

# Output
# CNN(
# (embeddings): Embedding(5000, 100, padding_idx=0)
# (conv): ModuleList(
# (0): Conv1d(100, 50, kernel_size=(2,), stride=(1,))
# (1): Conv1d(100, 50, kernel_size=(3,), stride=(1,))
# (2): Conv1d(100, 50, kernel_size=(4,), stride=(1,))
# )
# (dropout): Dropout(p=0.1, inplace=False)
# (fc1): Linear(in_features=150, out_features=100, bias=True)
# (fc2): Linear(in_features=100, out_features=4, bias=True)
# )

# Initialize trainer
trainer = Trainer(model=model, device=device)


# Dataloader
text = "The final tennis tournament starts next week."
X = tokenizer.texts_to_sequences([preprocess(text)])
print (tokenizer.sequences_to_texts(X))

# Output
# ['final tennis tournament starts next week']


y_filler = label_encoder.encode([label_encoder.classes[0]]*len(X))
dataset = Dataset(X=X, y=y_filler, max_filter_size=max_filter_size)
dataloader = dataset.create_dataloader(batch_size=batch_size)

# Inference
y_prob = trainer.predict_step(dataloader)
y_pred = np.argmax(y_prob, axis=1)
label_encoder.decode(y_pred)

# Class distributions
prob_dist = get_probability_distribution(y_prob=y_prob[0], classes=label_encoder.classes)
print (json.dumps(prob_dist, indent=2))

# Output
# {
# "Sports": 1.0,
# "World": 7.881690092248483e-12,
# "Sci/Tech": 1.270132816196673e-13,
# "Business": 2.3282168800871726e-18
# }

The model predicts that "The final tennis tournament starts next week." belongs to the "Sports" category.

We can also look at what each n-gram extractor ends up selecting at the max-pooling layer.
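The snippet below references conv_outputs (the per-filter-size activations before max pooling) and tokens, which are not produced by the code above. Here is one way to capture them, as a sketch that reuses the trained model's embedding and conv layers and mirrors the "SAME" padding from forward:

# Sketch: capture conv activations for the single inference sample.
batch_X, _ = next(iter(dataloader))                # (1, seq_len) from the inference dataloader
x_emb = model.embeddings(batch_X).transpose(1, 2)  # (1, embedding_dim, seq_len)
conv_outputs = []
for i, f in enumerate(FILTER_SIZES):
    padding_left = (f - 1) // 2                    # "SAME" padding for stride 1
    padding_right = f - 1 - padding_left
    _z = model.conv[i](F.pad(x_emb, (padding_left, padding_right)))
    conv_outputs.append(_z[0].detach().cpu().numpy())  # (num_filters, seq_len)
tokens = tokenizer.sequences_to_texts(X)[0].split(" ")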

sample_index = 0
print (f"Original text:\n{text}")
print (f"\nPreprocessed text:\n{tokenizer.sequences_to_texts(X)[0]}")
print ("\nMost important n-grams:")
# Process conv outputs for each unique filter size
for i, filter_size in enumerate(FILTER_SIZES):

    # Identify most important n-gram (excluding last token)
    popular_indices = collections.Counter([np.argmax(conv_output) \
        for conv_output in conv_outputs[i]])

    # Get corresponding text
    start = popular_indices.most_common(1)[-1][0]
    n_gram = " ".join([token for token in tokens[start:start+filter_size]])
    print (f"[{filter_size}-gram]: {n_gram}")

# Output
# Original text:
# The final tennis tournament starts next week.
#
# Preprocessed text:
# final tennis tournament starts next week
#
# Most important n-grams:
# [2-gram]: tennis tournament
# [3-gram]: final tennis tournament
# [4-gram]: final tennis tournament starts

Ending

As you can see, the CNN with an Embedding layer performs significantly better than the model that relied on one-hot encoding alone.