TL;DR

Although one-hot encoding can represent discrete variables as binary vectors and preserve structural information, it has two major drawbacks:

  • It scales linearly with the vocabulary size, which becomes a problem for large corpora: the vectors are enormous and sparse.
  • Each token is represented in isolation, without capturing its relationship to other tokens.

This post gives a brief introduction to embeddings and how they address these shortcomings of one-hot encoding.
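
As a quick illustration of both points, here is a minimal numpy sketch with a made-up toy vocabulary: each vector is as wide as the vocabulary, and the dot product between any two distinct words is zero, so one-hot vectors carry no notion of similarity.

import numpy as np

vocab = ["harry", "potter", "wizard", "muggle"]  # toy vocabulary
one_hot = {word: np.eye(len(vocab))[i] for i, word in enumerate(vocab)}

print(one_hot["wizard"])                      # [0. 0. 1. 0.] -> vector width grows with the vocabulary
print(one_hot["wizard"] @ one_hot["muggle"])  # 0.0 -> every pair of distinct words looks equally unrelated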

Learning embeddings

We will learn embeddings by building models in PyTorch, but first let's look at Gensim, a library built specifically for embeddings and topic modeling.

import nltk
nltk.download("punkt")

import numpy as np
import re
import requests

np.random.seed(1024)

# Split text into sentences
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
book = requests.get("https://s3.mindex.xyz/datasets/harrypotter.txt").content
sentences = tokenizer.tokenize(str(book))
print (f"{len(sentences)} sentences")

# Output
# 12449 sentences


def preprocess(text):
    """Conditional preprocessing on our text."""
    # Lower
    text = text.lower()

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)  # separate punctuation tied to words
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    # Separate into word tokens
    text = text.split(" ")

    return text


# Preprocess sentences
print (sentences[11])
sentences = [preprocess(s) for s in sentences]
print (sentences[11])

# Output
# Snape nodded, but did not elaborate.
# ['snape', 'nodded', 'but', 'did', 'not', 'elaborate']

The core idea behind embeddings is to represent a word not just by itself but also by its context. There are several ways to achieve this:

  • Given the context words, predict the target word (CBOW)
  • Given the target word, predict the context words (skip-gram)
  • Given a sequence of text, predict the next word (LM)

All of these approaches involve creating training data for a model. Every word in a sentence becomes a target word, and its context is determined by a window.

Take skip-gram with a window size of 2 as an example: each target word is paired with every word within two positions of it. We repeat this for every sentence in the corpus to produce training data for an unsupervised task. The key intuition is that similar words appear in similar contexts, and by repeatedly training on these (target, context) pairs we can learn that relationship.
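
As a rough sketch of how such (target, context) pairs could be generated from our preprocessed sentences (the skipgram_pairs helper below is purely illustrative, not what gensim does internally):

def skipgram_pairs(sentence, window_size=2):
    """Generate (target, context) pairs from one tokenized sentence."""
    pairs = []
    for i, target in enumerate(sentence):
        # every word within `window_size` positions of the target is a context word
        for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):
            if j != i:
                pairs.append((target, sentence[j]))
    return pairs

print(skipgram_pairs(["snape", "nodded", "but", "did", "not", "elaborate"]))
# [('snape', 'nodded'), ('snape', 'but'), ('nodded', 'snape'), ('nodded', 'but'), ...]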

We can learn embeddings with any of these approaches; which one to choose usually comes down to performance on the downstream supervised task.

Word2Vec

Things get complicated when we have a large vocabulary. Recall that during backpropagation, the softmax updates the weights for both the correct class and every incorrect class, which makes each backward pass a huge computation. The solution is negative sampling, which only updates the correct class and a small random subset of the incorrect classes (NEGATIVE_SAMPLING = 20).
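
As a rough sketch of the idea (the helper and toy vectors below are purely illustrative, not gensim's actual implementation), the loss for a single (target, context) pair pulls the true pair together and pushes only a handful of sampled negatives away, instead of normalizing over the whole vocabulary. In practice we simply pass negative=NEGATIVE_SAMPLING to gensim, as in the code below.

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def negative_sampling_loss(target_vec, context_vec, negative_vecs):
    """Binary logistic loss: score the true context toward 1 and sampled negatives toward 0."""
    pos_loss = -np.log(sigmoid(target_vec @ context_vec))             # pull the true pair together
    neg_loss = -np.sum(np.log(sigmoid(-negative_vecs @ target_vec)))  # push sampled negatives away
    return pos_loss + neg_loss  # only 1 + len(negative_vecs) output rows receive gradients

rng = np.random.default_rng(0)
target, context = rng.normal(size=100), rng.normal(size=100)
negatives = rng.normal(size=(20, 100))  # 20 randomly sampled "incorrect" words
print(negative_sampling_loss(target, context, negatives))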

import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

EMBEDDING_DIM = 100
WINDOW = 5
MIN_COUNT = 3
SKIP_GRAM = 1
NEGATIVE_SAMPLING = 20

w2v = Word2Vec(
    sentences=sentences, vector_size=EMBEDDING_DIM,
    window=WINDOW, min_count=MIN_COUNT,
    sg=SKIP_GRAM, negative=NEGATIVE_SAMPLING)
print (w2v)

# Output
# Word2Vec<vocab=4937, vector_size=100, alpha=0.025>

# Vector for each word
w2v.wv.get_vector("potter")

# Output
# array([ 0.04592679, 0.26393083, -0.29759625, -0.51007414, 0.02860732,
# -0.01302573, 0.3703193 , 0.14425582, -0.4187037 , 0.04296769,
# -0.13030362, -0.30441925, -0.14958233, 0.04964258, 0.14798391,
# -0.18539314, 0.51730794, 0.01598365, -0.11325987, -0.6307836 ,
# 0.39244524, 0.25232184, 0.29555508, -0.22162063, -0.29100868,
# -0.22083738, -0.52918744, -0.68654346, -0.09764519, 0.05514489,
# 0.06108054, 0.3587375 , -0.01166064, -0.42530054, -0.05000629,
# 0.45623606, -0.29811206, -0.09037815, -0.0024387 , -0.41930553,
# 0.12495753, -0.1773121 , 0.19551197, 0.02754493, 0.25369856,
# 0.10022393, -0.38912103, -0.10274333, -0.24544689, 0.00851442,
# 0.26698554, -0.03026148, 0.12343717, -0.07433262, 0.0162609 ,
# 0.15033086, 0.09943663, 0.28371716, -0.26024884, -0.05571229,
# 0.0938114 , -0.00562614, -0.11472147, 0.21217017, 0.12490374,
# 0.34131378, 0.10346038, 0.38650215, -0.44265935, -0.02233333,
# -0.47005087, -0.28585035, 0.06968105, 0.08989634, 0.22004889,
# -0.22940454, -0.06248426, 0.089827 , -0.35011858, 0.11977731,
# -0.06323916, 0.0940324 , -0.31842625, 0.53730965, 0.17043817,
# 0.15869781, 0.40275395, 0.04705542, 0.35397893, 0.00738561,
# 0.21539825, 0.14310665, 0.13341616, -0.0660746 , 0.42496106,
# 0.09145384, 0.47487733, -0.23636843, 0.00715503, 0.05220298],
# dtype=float32)

# Get nearest neighbors (excluding itself)
w2v.wv.most_similar(positive="scar", topn=5)

# Output
# [('forehead', 0.9045635461807251),
# ('pain', 0.9014869928359985),
# ('mouth', 0.8918080925941467),
# ('prickling', 0.890386164188385),
# ('throat', 0.8795480728149414)]

# Saving and loading
w2v.wv.save_word2vec_format("w2v.bin", binary=True)
wv = KeyedVectors.load_word2vec_format("w2v.bin", binary=True)

FastText

What happens when a word doesn't exist in our vocabulary? We could assign a UNK token for out-of-vocabulary words, or we could use FastText, which embeds words using character-level n-grams. This helps it handle rare words, misspelled words, and words that never appeared in our corpus but are similar to words that did.
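
As a rough sketch of the character n-gram idea (the char_ngrams helper below is only illustrative; gensim's FastText handles this internally): a word is padded with boundary markers and decomposed into overlapping n-grams, so an unseen word like "scarring" still shares many n-grams with "scar".

def char_ngrams(word, n_min=3, n_max=6):
    """Decompose a word into character n-grams, with < and > marking word boundaries."""
    padded = f"<{word}>"
    return [padded[i:i + n]
            for n in range(n_min, n_max + 1)
            for i in range(len(padded) - n + 1)]

print(char_ngrams("scar", n_max=4))
# ['<sc', 'sca', 'car', 'ar>', '<sca', 'scar', 'car>']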

from gensim.models import FastText

# Super fast because of optimized C code under the hood
ft = FastText(sentences=sentences, vector_size=EMBEDDING_DIM,
              window=WINDOW, min_count=MIN_COUNT,
              sg=SKIP_GRAM, negative=NEGATIVE_SAMPLING)
print (ft)

# Output
# FastText<vocab=4937, vector_size=100, alpha=0.025>

# This word doesn't exist so the word2vec model will error out
wv.most_similar(positive='scarring', topn=5)
# Output
# KeyError: "Key 'scarring' not present in vocabulary"

# FastText will use n-grams to embed an OOV word
ft.wv.most_similar(positive='scarring', topn=5)

# Output
# [('swimming', 0.9938331246376038),
# ('howling', 0.9927006959915161),
# ('dabbing', 0.9923058748245239),
# ('wriggling', 0.9921060800552368),
# ('bulging', 0.9919766783714294)]

# Saving and loading
ft.wv.save("ft.bin")
ftwv = KeyedVectors.load("ft.bin")

Pretrained embeddings

We can learn embeddings from scratch using the methods above, or we can use pretrained embeddings that have already been trained on millions of documents. Popular ones include Word2Vec and GloVe.


# Preview of the GloVe embeddings file
with open("glove.6B.100d.txt", "r") as fp:
    line = next(fp)
    values = line.split()
    word = values[0]
    embedding = np.asarray(values[1:], dtype='float32')
    print (f"word: {word}")
    print (f"embedding:\n{embedding}")
    print (f"embedding dim: {len(embedding)}")

# Output
# word: the
# embedding:
# [-0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141
# 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384
# -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464
# -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155
# -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021
# 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531
# 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559
# -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243
# 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514
# 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044
# 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212
# -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148
# -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215
# -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459
# 0.8278 0.27062 ]
# embedding dim: 100

# Load embeddings (may take a minute)
glove = KeyedVectors.load_word2vec_format("glove.6B.100d.txt", binary=False, no_header=True)


# (king - man) + woman = ?
# king - man = ? - woman
glove.most_similar(positive=["woman", "king"], negative=["man"], topn=5)

# [('queen', 0.7698540687561035),
# ('monarch', 0.6843381524085999),
# ('throne', 0.6755736470222473),
# ('daughter', 0.6594556570053101),
# ('princess', 0.6520534157752991)]


# Get nearest neighbors (excluding itself)
glove.most_similar(positive="goku", topn=5)

# [('gohan', 0.7246542572975159),
# ('bulma', 0.6497020125389099),
# ('raistlin', 0.644360363483429),
# ('skaar', 0.6316742897033691),
# ('guybrush', 0.6231325268745422)]

Let's visualize the relative positions of the four words king, queen, man, and woman.

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


# Reduce dimensionality for plotting
X = glove[glove.index_to_key]
pca = PCA(n_components=2)
pca_results = pca.fit_transform(X)

def plot_embeddings(words, embeddings, pca_results):
    for word in words:
        idx = embeddings.key_to_index[word]
        plt.scatter(pca_results[idx, 0], pca_results[idx, 1])
        plt.annotate(word, xy=(pca_results[idx, 0], pca_results[idx, 1]))
    plt.show()


# Visualize
plot_embeddings(
    words=["king", "queen", "man", "woman"], embeddings=glove,
    pca_results=pca_results)

Now let's see which words are close to both woman and doctor but far from man.

# Bias in embeddings
glove.most_similar(positive=["woman", "doctor"], negative=["man"], topn=5)

# Output
# [('nurse', 0.7735227942466736),
# ('physician', 0.7189430594444275),
# ('doctors', 0.6824328303337097),
# ('patient', 0.6750683188438416),
# ('dentist', 0.6726033091545105)]

Ending

In the next post, we'll look at how embeddings can improve the CNN classification model we introduced in the previous post.