[PyTorch Study Notes] 6. Recurrent Neural Networks


Table of Contents

  • 40. Time Series Representation
  • 41. Recurrent Neural Networks
  • 42. Using the RNN Layer
    • 42.1 nn.RNN
    • 42.2 nn.RNNCell
  • 43. Time Series Prediction
  • 44. RNN Training Difficulties
  • 45. Using the LSTM Layer
    • 45.1 nn.LSTM
    • 45.2 nn.LSTMCell
  • 46. Sentiment Classification in Practice

Compiled from Long Liangqu's PyTorch course videos; video links:
【计算机-AI】PyTorch学这个就够了!
(好课推荐)深度学习与PyTorch入门实战——主讲人龙良曲

40. Time Series Representation

Sequence representation

  • [seq_len, feature_len]
  • [word, word_vec] with one-hot encoding: sparse, high-dimensional, and with no notion of semantic similarity
  • [words, word_vec] with dense embeddings (word2vec / GloVe): compact vectors in which similar words lie close together (see the sketch below)
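
A minimal sketch of the contrast (the tiny three-word vocabulary here is made up for illustration): a one-hot vector has as many dimensions as the vocabulary with a single 1, while an embedding lookup maps the same index to a short dense vector.

    import torch
    import torch.nn.functional as F

    vocab = {'hello': 0, 'world': 1, 'pytorch': 2}    # hypothetical 3-word vocabulary
    idx = torch.tensor([vocab['hello']])

    one_hot = F.one_hot(idx, num_classes=len(vocab))  # sparse: dim == vocab size
    print(one_hot)                                    # tensor([[1, 0, 0]])

    embed = torch.nn.Embedding(len(vocab), 5)         # dense: 5-dim, learnable
    print(embed(idx).shape)                           # torch.Size([1, 5])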

word2vec vs GloVe

    import torch
    import torch.nn as nn
    from torchnlp.word_to_vector import GloVe

    word_to_idx = {'hello': 0, 'world': 1}
    lookup_tensor = torch.tensor([word_to_idx['hello']], dtype=torch.long)

    embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5-dimensional embeddings
    hello_embed = embeds(lookup_tensor)
    print(hello_embed)
    # tensor([[ 0.2565, -0.2827, -0.0259, -1.9533, 0.8330]], grad_fn=<EmbeddingBackward>)

    vectors = GloVe()            # downloads the pretrained vectors (about 2 GB) on first use
    print(vectors['hello'])

Install the torchnlp package with: pip install pytorch-nlp

41. Recurrent Neural Networks

Weight sharing
Consistent memory

42. Using the RNN Layer

input dim, hidden dim

    rnn = nn.RNN(100, 10)  # input_size (word_dim), hidden_size (memory/h)
    print(rnn._parameters.keys())
    # odict_keys(['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'])
    print(rnn.weight_hh_l0.shape, rnn.weight_ih_l0.shape)  # torch.Size([10, 10]) torch.Size([10, 100])
    print(rnn.bias_hh_l0.shape, rnn.bias_ih_l0.shape)      # torch.Size([10]) torch.Size([10])
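
These shapes follow from the RNN update h_t = tanh(x_t @ W_ih^T + b_ih + h_(t-1) @ W_hh^T + b_hh). A small sanity check of my own (not from the video) that recomputes one step by hand and compares it with the layer's output:

    x = torch.randn(1, 3, 100)   # seq_len=1, batch=3, feature=100
    h0 = torch.zeros(1, 3, 10)
    out, h = rnn(x, h0)

    # one step of the recurrence, computed manually from the layer's parameters
    manual = torch.tanh(x[0] @ rnn.weight_ih_l0.T + rnn.bias_ih_l0
                        + h0[0] @ rnn.weight_hh_l0.T + rnn.bias_hh_l0)
    print(torch.allclose(out[0], manual, atol=1e-6))  # True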

42.1 nn.RNN

  • __init__
    (input_size, hidden_size, num_layers)
  • out, ht = forward(x, h0)
    x: [seq_len, b, word_vec]
    h0/ht: [num_layers, b, h_dim]
    out: [seq_len, b, h_dim]

Single layer RNN

    rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=1)
    print(rnn)  # RNN(100, 20)

    x = torch.randn(10, 3, 100)
    out, h = rnn(x, torch.zeros(1, 3, 20))
    print(out.shape, h.shape)  # torch.Size([10, 3, 20]) torch.Size([1, 3, 20])

h is the hidden (memory) state of every layer at the final time step; out is the hidden state of the last layer at every time step.
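
For this single-layer RNN the two views meet at the final step: the last row of out equals the only layer's final hidden state. A quick check (my own addition):

    print(torch.allclose(out[-1], h[0]))  # True: last time step of out == final hidden state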

2 layer RNN

    rnn = nn.RNN(100, 10, num_layers=2)  # input_size (word_dim), hidden_size (memory/h)
    print(rnn._parameters.keys())
    # odict_keys(['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0',
    #             'weight_ih_l1', 'weight_hh_l1', 'bias_ih_l1', 'bias_hh_l1'])
    print(rnn.weight_hh_l0.shape, rnn.weight_ih_l0.shape)  # torch.Size([10, 10]) torch.Size([10, 100])
    print(rnn.bias_hh_l0.shape, rnn.bias_ih_l0.shape)      # torch.Size([10]) torch.Size([10])
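
One detail worth spelling out (my own check): for every layer above the first, the "input" is the hidden state of the layer below, so weight_ih_l1 is [hidden_size, hidden_size] rather than [hidden_size, input_size]:

    print(rnn.weight_ih_l1.shape, rnn.weight_hh_l1.shape)  # torch.Size([10, 10]) torch.Size([10, 10])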

out: [T, b, h_dim], h: [layers, b, h_dim]

    rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=4)
    print(rnn)  # RNN(100, 20, num_layers=4)

    x = torch.randn(10, 3, 100)
    out, h = rnn(x)
    print(out.shape, h.shape)  # torch.Size([10, 3, 20]) torch.Size([4, 3, 20])
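
By default nn.RNN expects x as [seq_len, b, feature]. The time-series example in section 43 passes batch_first=True, which swaps the first two input/output dimensions while h keeps its layout; a minimal sketch of my own showing the difference:

    rnn_bf = nn.RNN(input_size=100, hidden_size=20, num_layers=4, batch_first=True)
    x_bf = torch.randn(3, 10, 100)   # [b, seq_len, feature]
    out, h = rnn_bf(x_bf)
    print(out.shape, h.shape)        # torch.Size([3, 10, 20]) torch.Size([4, 3, 20])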

42.2 nn.RNNCell

  • __init__
    (input_size, hidden_size) — nn.RNNCell takes no num_layers argument; stack cells manually for multiple layers
  • ht = rnncell(xt, ht_1)
    xt: [b, word_vec]
    ht_1/ht: [b, h_dim]
    out = torch.stack([h1, h2, …, ht]) (see the sketch after the Functional example below)

Functional

    x = torch.randn(10, 3, 100)  # [seq_len, b, word_vec]
    cell1 = nn.RNNCell(100, 30)
    cell2 = nn.RNNCell(30, 20)
    h1 = torch.zeros(3, 30)
    h2 = torch.zeros(3, 20)

    for xt in x:                 # iterate over the time dimension
        h1 = cell1(xt, h1)
        h2 = cell2(h1, h2)

    print(h1.shape)  # torch.Size([3, 30])
    print(h2.shape)  # torch.Size([3, 20])
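
To recover the out tensor that nn.RNN would return, collect the top cell's hidden state at every step and stack them, as the out = torch.stack(...) line above suggests. A sketch of my own:

    outs = []
    h1 = torch.zeros(3, 30)
    h2 = torch.zeros(3, 20)
    for xt in x:
        h1 = cell1(xt, h1)
        h2 = cell2(h1, h2)
        outs.append(h2)
    out = torch.stack(outs)  # [seq_len, b, h_dim]
    print(out.shape)         # torch.Size([10, 3, 20])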

43. Time Series Prediction

    import numpy as np
    import matplotlib.pyplot as plt
    import torch
    import torch.nn as nn
    import torch.optim as optim

    num_time_steps = 50
    input_size = 1
    hidden_size = 16
    output_size = 1
    lr = 0.01

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.rnn = nn.RNN(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=1,
                batch_first=True
            )
            for p in self.rnn.parameters():
                nn.init.normal_(p, mean=0.0, std=0.001)
            self.linear = nn.Linear(hidden_size, output_size)

        def forward(self, x, hidden_prev):
            out, hidden_prev = self.rnn(x, hidden_prev)
            # [b, seq, h] => [seq, h]  (b = 1)
            out = out.view(-1, hidden_size)
            # [seq, h] => [seq, 1]
            out = self.linear(out)
            # [seq, 1] => [1, seq, 1]
            out = out.unsqueeze(dim=0)
            return out, hidden_prev

    model = Net()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr)
    hidden_prev = torch.zeros(1, 1, hidden_size)

    for iter in range(6000):
        start = np.random.randint(3, size=1)[0]
        time_steps = np.linspace(start, start + 10, num_time_steps)
        data = np.sin(time_steps)
        data = data.reshape(num_time_steps, 1)
        x = torch.tensor(data[:-1]).float().view(1, num_time_steps - 1, 1)
        y = torch.tensor(data[1:]).float().view(1, num_time_steps - 1, 1)

        output, hidden_prev = model(x, hidden_prev)
        hidden_prev = hidden_prev.detach()  # detach from the graph (requires_grad=False) so gradients don't flow back across iterations

        loss = criterion(output, y)
        model.zero_grad()
        loss.backward()
        # inspect gradient norms / clip if needed
        # for p in model.parameters():
        #     print(p.grad.norm())
        #     torch.nn.utils.clip_grad_norm_(p, 10)
        optimizer.step()

        if iter % 100 == 0:
            print('Iteration: {} loss {}'.format(iter, loss.item()))

    # predict one curve, feeding the model its own output step by step
    start = np.random.randint(3, size=1)[0]
    time_steps = np.linspace(start, start + 10, num_time_steps)
    data = np.sin(time_steps)
    data = data.reshape(num_time_steps, 1)
    x = torch.tensor(data[:-1]).float().view(1, num_time_steps - 1, 1)
    y = torch.tensor(data[1:]).float().view(1, num_time_steps - 1, 1)

    predictions = []
    input = x[:, 0, :]  # [b, seq, feature] => [b, feature]
    for _ in range(x.shape[1]):
        input = input.view(1, 1, 1)
        (pred, hidden_prev) = model(input, hidden_prev)
        input = pred  # e.g. tensor([[[0.8720]]], grad_fn=<UnsqueezeBackward0>)
        predictions.append(pred.detach().numpy().ravel()[0])

    x = x.data.numpy().ravel()
    plt.scatter(time_steps[:-1], x.ravel(), s=90)
    plt.plot(time_steps[:-1], x.ravel())
    plt.scatter(time_steps[1:], predictions)
    plt.show()

(Figure: ground-truth sine points and the model's step-by-step predictions plotted with matplotlib.)

44. RNN Training Difficulties

  • Gradient Exploding
    Solution: gradient clipping (see the sketch below)
  • Gradient Vanishing
    Solution: LSTM
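
A minimal sketch of gradient clipping with PyTorch's built-in utility, as it would sit inside the section-43 training loop between loss.backward() and optimizer.step() (the max norm of 10 is just an example value):

    loss.backward()
    # rescale all gradients so that their combined norm does not exceed 10
    torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
    optimizer.step()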

45. Using the LSTM Layer

45.1 nn.LSTM

  • __init__
    (input_size, hidden_size, num_layers)

LSTM.forward()

  • out, (ht, ct) = lstm(x, [ht_0, ct_0])
    x: [seq_len, b, word_vec]
    h/c: [num_layers, b, h_dim]
    out: [seq_len, b, h_dim]

    lstm = nn.LSTM(input_size=100, hidden_size=20, num_layers=4)
    print(lstm) # LSTM(100, 20, num_layers=4)
    x = torch.rand(10, 3, 100)
    out, (h, c) = lstm(x)
    print(out.shape, h.shape, c.shape) # torch.Size([10, 3, 20]) torch.Size([4, 3, 20]) torch.Size([4, 3, 20])

45.2 nn.LSTMCell

  • __init__
    (input_size, hidden_size) — nn.LSTMCell takes no num_layers argument; stack cells manually for multiple layers

LSTMCell.forward()

  • ht, ct = lstmcell(xt, [ht_0, ct_0])
    xt: [b, word_vec]
    h/c: [b, h_dim]

Single layer

    x = torch.randn(10, 3, 100)  # [seq_len, b, word_vec]
    cell = nn.LSTMCell(input_size=100, hidden_size=20)
    print(cell)  # LSTMCell(100, 20)

    h = torch.zeros(3, 20)
    c = torch.zeros(3, 20)
    for xt in x:
        h, c = cell(xt, (h, c))

    print(h.shape, c.shape)  # torch.Size([3, 20]) torch.Size([3, 20])

Two layers

    cell1 = nn.LSTMCell(input_size=100, hidden_size=30)
    cell2 = nn.LSTMCell(input_size=30, hidden_size=20)
    h1 = torch.zeros(3, 30)
    c1 = torch.zeros(3, 30)
    h2 = torch.zeros(3, 20)
    c2 = torch.zeros(3, 20)

    for xt in x:
        h1, c1 = cell1(xt, (h1, c1))
        h2, c2 = cell2(h1, (h2, c2))

    print(h2.shape, c2.shape)  # torch.Size([3, 20]) torch.Size([3, 20])

46. Sentiment Classification in Practice

Google CoLab

  • Continuous 12 hours
  • free K80 for GPU
  • no need to cross GFW

You can't use Colaboratory without a Google account; noting this down for later.

    import torch
    import torch.nn as nn
    import numpy as np
    from torchtext import data, datasets

    # load dataset
    TEXT = data.Field(tokenize='spacy')
    LABEL = data.LabelField(dtype=torch.float)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

    print('len of train data:', len(train_data))
    print('len of test data:', len(test_data))
    print(train_data.examples[15].text)
    print(train_data.examples[15].label)

    class RNN(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_dim):
            super(RNN, self).__init__()
            # [0-10001] => [100]
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            # [100] => [256]
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                               bidirectional=True, dropout=0.5)
            # [256*2] => [1]
            self.fc = nn.Linear(hidden_dim * 2, 1)
            self.dropout = nn.Dropout(0.5)

        def forward(self, x):
            # [seq, b, 1] => [seq, b, 100]
            embedding = self.dropout(self.embedding(x))
            # output: [seq, b, hid_dim*2]
            # hidden/h: [num_layer*2, b, hid_dim]
            # cell/c:   [num_layer*2, b, hid_dim]
            output, (hidden, cell) = self.rnn(embedding)
            # [num_layer*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]
            hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
            # [b, hid_dim*2] => [b, 1]
            hidden = self.dropout(hidden)
            out = self.fc(hidden)
            return out

    # load word embedding
    # NOTE: TEXT.build_vocab(...) must be called before this point (see the sketch after this listing)
    rnn = RNN(len(TEXT.vocab), 100, 256)
    pretrained_embedding = TEXT.vocab.vectors
    print('pretrained_embedding:', pretrained_embedding.shape)
    rnn.embedding.weight.data.copy_(pretrained_embedding)
    print('embedding layer inited.')

    def binary_acc(preds, y):
        preds = torch.round(torch.sigmoid(preds))
        correct = torch.eq(preds, y).float()
        acc = correct.sum() / len(correct)
        return acc

    def eval(rnn, iterator, criteon):
        avg_acc = []
        rnn.eval()
        with torch.no_grad():
            for batch in iterator:
                # [b, 1] => [b]
                pred = rnn(batch.text).squeeze(1)
                loss = criteon(pred, batch.label)
                acc = binary_acc(pred, batch.label).item()
                avg_acc.append(acc)
        avg_acc = np.array(avg_acc).mean()
        print('>>>test:', avg_acc)

    def train(rnn, iterator, optimizer, criteon):
        avg_acc = []
        rnn.train()
        for i, batch in enumerate(iterator):
            # [seq, b] => [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)
            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()
            avg_acc.append(acc)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    def main():
        pass

    if __name__ == '__main__':
        main()
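
As copied, the script is incomplete: TEXT.vocab does not exist until build_vocab is called, no iterators, loss, or optimizer are created, and main() does nothing. Below is a hedged sketch of the usual legacy-torchtext glue (the batch size, GloVe variant, learning rate, and epoch count are my own choices, not from the original video) that would have to run before the RNN is constructed:

    # build the vocabularies; 'glove.6B.100d' supplies the 100-dim pretrained vectors
    TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
    LABEL.build_vocab(train_data)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, test_data), batch_size=30, device=device)

    rnn = RNN(len(TEXT.vocab), 100, 256).to(device)
    rnn.embedding.weight.data.copy_(TEXT.vocab.vectors)

    optimizer = torch.optim.Adam(rnn.parameters(), lr=1e-3)
    criteon = nn.BCEWithLogitsLoss().to(device)  # matches the sigmoid used in binary_acc

    for epoch in range(10):
        train(rnn, train_iterator, optimizer, criteon)
        eval(rnn, test_iterator, criteon)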

I only copied the code here; I haven't really worked through it yet.
