from d2l import jax as d2l
import jax
from jax import numpy as jnp
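import os

What does a trained word embedding actually capture? Two classical probes: word similarity (do a word's nearest neighbors in embedding space look semantically related?) and word analogy (do relations between words show up as consistent vector offsets?).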
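This deck loads pretrained GloVe vectors and exercises both properties. Of the sets registered below, the 6B-token (Wikipedia-based) corpus is available in 50 and 100 dimensions, and the 300-dimensional vectors come from the 42B-token corpus.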
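GloVe ships as plain text: each line is `<word>` followed by that word's vector as space-separated floats (50, 100, or 300 of them, depending on the file). Parse the file into a vocabulary plus a tensor of vectors: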
# Register the pretrained GloVe archives with the d2l download hub. Each entry
# maps an embedding name to a (download URL, hex digest) pair; the digest is
# presumably the archive's SHA-1 used to validate the download -- confirm
# against d2l's download helper.
d2l.DATA_HUB['glove.6b.50d'] = (
    d2l.DATA_URL + 'glove.6B.50d.zip',
    '0b8703943ccdb6eb788e6f091b8946e82231bc4d',
)

d2l.DATA_HUB['glove.6b.100d'] = (
    d2l.DATA_URL + 'glove.6B.100d.zip',
    'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a',
)

d2l.DATA_HUB['glove.42b.300d'] = (
    d2l.DATA_URL + 'glove.42B.300d.zip',
    'b5116e234e9eb9076672cfeabf5469f3eec904fa',
)
d2l.DATA_HUB['wiki.en'] = (d2l.DATA_URL + 'wiki.en.zip',
                           'c1816da3821ae9f43899be655002f6c723e91b88')


class TokenEmbedding:
    """Pretrained token embedding loaded from a ``vec.txt`` text file.

    Index 0 is reserved for the unknown token ``'<unk>'``, whose vector is
    all zeros; tokens read from the file follow in file order.

    Attributes:
        idx_to_token: List of tokens; a token's position is its index.
        idx_to_vec: Result of ``d2l.tensor`` over the vector rows, one row
            per entry of ``idx_to_token``.
        unknown_idx: Index used for tokens missing from the vocabulary (0).
        token_to_idx: Dict mapping each token to its index. If a token occurs
            more than once in the file, the last occurrence wins.
    """

    def __init__(self, embedding_name):
        """Fetch the embedding named ``embedding_name`` and build the vocab.

        Args:
            embedding_name: Name handed to ``d2l.download_extract``
                (presumably a key registered in ``d2l.DATA_HUB`` -- confirm).
        """
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        """Read tokens and vectors from the extracted ``vec.txt`` file.

        Args:
            embedding_name: Name handed to ``d2l.download_extract``, whose
                return value is used as the directory containing ``vec.txt``.

        Returns:
            A pair ``(idx_to_token, idx_to_vec)``: the token list with
            ``'<unk>'`` at index 0, and ``d2l.tensor`` of the vector rows
            with an all-zero row at index 0.

        Raises:
            IndexError: If the file yields no vector rows at all.
            ValueError: If a field after the token cannot be parsed as float.
        """
        idx_to_token, idx_to_vec = ['<unk>'], []
        data_dir = d2l.download_extract(embedding_name)
        # GloVe website: https://nlp.stanford.edu/projects/glove/
        # fastText website: https://fasttext.cc/
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, which is not UTF-8 everywhere; pretrained
        # vocabularies typically contain non-ASCII tokens.
        with open(os.path.join(data_dir, 'vec.txt'), 'r',
                  encoding='utf-8') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                # Skip header information, such as the top row in fastText
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        # Prepend the all-zero vector for '<unk>', sized like the first row.
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, d2l.tensor(idx_to_vec)

    def __getitem__(self, tokens):
        """Return the embedding vectors for ``tokens``.

        Args:
            tokens: Iterable of token strings. A bare ``str`` is iterated
                character by character, so wrap a single token in a list.

        Returns:
            ``idx_to_vec`` indexed by the tokens' indices; tokens that are not
            in the vocabulary use ``unknown_idx``.
        """
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[d2l.tensor(indices)]
        return vecs

    def __len__(self):
        """Return the number of tokens, including the ``'<unk>'`` entry."""
        return len(self.idx_to_token)


# Stray cell output that was fused onto the `return` line above:
# 400001  (presumably len() of a loaded 6B-token GloVe embedding -- confirm)
Find the k nearest neighbors of a word by cosine similarity. Try seed words of different kinds: synonyms, related concepts, named entities. The result reflects distributional similarity, not dictionary synonymy:
Expect topical neighbors as well as true synonyms. Static vectors collapse all senses of a word into one point, so polysemous words can produce mixed neighborhoods.
cosine sim=0.839: babies
cosine sim=0.800: boy
cosine sim=0.792: girl
$\mathbf{v}_b - \mathbf{v}_a + \mathbf{v}_c \approx \mathbf{v}_d$ — the classic a : b :: c : d analogy. Build the query vector $\mathbf{v}_b - \mathbf{v}_a + \mathbf{v}_c$ and look up its nearest neighbor to read out d.
Good analogy results mean the training corpus encoded a fairly linear relation. Bad results are still useful: they show the limits of one-vector-per-word embeddings.
'japan'