CNTK (the Microsoft Cognitive Toolkit) is a deep learning framework for building and training many kinds of neural network models. This article walks through using CNTK for sequence classification, a task that appears throughout natural language processing (NLP).
Sequence classification maps an input sequence to an output label or sequence. In NLP, the input is usually a sequence of words or characters, and the output is a predicted label, as in text classification or named entity recognition. Here we train a simple LSTM model for sentiment analysis.
First, we need a classification dataset. We use the IMDB sentiment analysis dataset, organized here as one file for the training split and one for the test split. Each line holds one review and its label (0 for negative, 1 for positive).
We parse and preprocess these files, turning each review into a feature vector and one-hot encoding the labels.
import os
import numpy as np

np.random.seed(123)

def parse_imdb_data(filename):
    # Each line looks like "<review text>\t<label>"; skip malformed lines.
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read().split('\n')
    data = [d.split('\t') for d in data if len(d.split('\t')) == 2]
    texts = [d[0] for d in data]
    labels = [int(d[1]) for d in data]
    # One-hot encode the labels: 0 -> [1, 0], 1 -> [0, 1].
    return texts, np.eye(2)[labels].astype(np.float32)

def prepare_imdb_data(imdb_folder):
    train_texts, train_labels = parse_imdb_data(os.path.join(imdb_folder, 'train.txt'))
    test_texts, test_labels = parse_imdb_data(os.path.join(imdb_folder, 'test.txt'))
    return train_texts, train_labels, test_texts, test_labels
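For reference, parse_imdb_data expects each line of train.txt and test.txt to hold one review and its label separated by a tab. The two lines below are illustrative only; the file names and layout are this tutorial's convention, not the official IMDB release:

this movie was wonderful from start to finish	1
a dull and predictable plot	0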
Note that CNTK represents input data as a list of sequences, where each sequence is a variable-length list of feature vectors. We therefore tokenize every comment and pad it so that all comments end up with the same length.
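As a minimal sketch of that input format (a toy five-word vocabulary, unrelated to the IMDB data): each element of the outer list is one sequence, sequences may have different lengths, and C.Value.one_hot converts lists of word ids into the sparse one-hot vectors a sequence input variable expects.

import cntk as C

vocab_size = 5
seq_input = C.sequence.input_variable(vocab_size, is_sparse=True)
# Two sequences of different lengths, each a list of word ids.
batch = C.Value.one_hot([[0, 2, 1], [3, 4]], vocab_size)
# Summing over the sequence axis yields one vector per sequence.
print(C.sequence.reduce_sum(seq_input).eval({seq_input: batch}))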
import re
import spacy

nlp = spacy.load('en_core_web_sm')

# Pad a comment to max_length tokens with the '<PAD>' marker,
# trimming first if it is too long.
def pad_comment(comment, max_length):
    return comment[:max_length] + ['<PAD>'] * max(0, max_length - len(comment))

def trim_comment(comment, max_length):
    return comment[:max_length]

def tokenize_comment(comment):
    # Keep letters, digits and basic punctuation; replace everything else with spaces.
    comment = re.sub('[^a-zA-Z0-9.\'\"\-:\)\(]', ' ', comment)
    tokens = [token.text for token in nlp(comment)]
    return tokens
def preprocess_imdb_data(train_texts, train_labels, test_texts, test_labels, max_length):
    # Tokenize every comment.
    train_texts = [tokenize_comment(comment) for comment in train_texts]
    test_texts = [tokenize_comment(comment) for comment in test_texts]
    # Trim long comments and pad short ones so all have exactly max_length tokens.
    train_texts = [pad_comment(trim_comment(comment, max_length), max_length) for comment in train_texts]
    test_texts = [pad_comment(trim_comment(comment, max_length), max_length) for comment in test_texts]
    # Count word frequencies over the training comments only.
    word_counts = dict()
    for comment in train_texts:
        for word in comment:
            if word != '<PAD>':
                word_counts[word] = word_counts.get(word, 0) + 1
    # Most frequent words get the smallest ids; ids 0 and 1 are reserved.
    vocab = sorted(word_counts.keys(), key=lambda w: -word_counts[w])
    word_to_idx = dict((word, i + 2) for i, word in enumerate(vocab))
    word_to_idx['<PAD>'] = 0
    word_to_idx['<UNK>'] = 1
    idx_to_word = dict((i, word) for word, i in word_to_idx.items())
    # Map each word to its id; words unseen in training fall back to <UNK>.
    train_data = [[word_to_idx.get(word, 1) for word in comment] for comment in train_texts]
    test_data = [[word_to_idx.get(word, 1) for word in comment] for comment in test_texts]
    return train_data, train_labels, test_data, test_labels, word_to_idx, idx_to_word
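A quick sanity check of what the preprocessing returns, assuming the texts and labels were loaded with prepare_imdb_data as above:

train_data, train_labels, test_data, test_labels, word_to_idx, idx_to_word = \
    preprocess_imdb_data(train_texts, train_labels, test_texts, test_labels, max_length=50)
print(len(train_data[0]))    # every comment is exactly 50 ids long
print(train_labels.shape)    # (num_train_examples, 2), one-hot labels
print(word_to_idx['<PAD>'])  # 0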
With preprocessing done, we build the model. Here we use a bidirectional LSTM; its hidden vectors feed a sigmoid output layer, and the predictions are compared with the labels to compute the loss.
Note that a sequence classification model must produce a single fixed-size output per sequence, so an average pooling step is inserted between the LSTM layer and the sigmoid layer.
import cntk as C

def create_lstm_model(vocab_size, hidden_dim, output_dim, max_length):
    # Word sequences arrive as one-hot vectors over the vocabulary (sparse).
    x = C.sequence.input_variable(vocab_size, is_sparse=True, name='features')
    y = C.input_variable(output_dim, name='labels')
    # The embedding layer maps one-hot words to dense word vectors.
    embed = C.layers.Embedding(hidden_dim, name='embed')(x)
    # Bidirectional LSTM: one recurrence runs forward, one backward,
    # and the two hidden sequences are spliced together.
    lstm_fwd = C.layers.Recurrence(C.layers.LSTM(hidden_dim))(embed)
    lstm_bwd = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=True)(embed)
    lstm = C.splice(lstm_fwd, lstm_bwd)
    # Average pooling over the sequence axis: every comment is padded to
    # max_length, so the mean is the sum divided by max_length.
    pooled = C.sequence.reduce_sum(lstm) / float(max_length)
    # The pooled vector has dimension hidden_dim * 2, so reduce it
    # before the sigmoid output layer.
    h = C.layers.Dense(hidden_dim, activation=C.relu)(pooled)
    y_pred = C.layers.Dense(output_dim, activation=C.sigmoid, name='output')(h)
    loss = C.binary_cross_entropy(y_pred, y)
    error = C.classification_error(y_pred, y)
    return x, y, y_pred, loss, error
Once the model is built, we prepare the data in minibatches and train. This requires choosing a few hyperparameters, such as the learning rate and the hidden layer size. A simple training script follows.
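CNTK's Python API does not provide a ready-made minibatch generator for plain Python lists, so we sketch a small one here; the name minibatches and the shuffle-per-epoch policy are arbitrary choices of this tutorial. It converts each batch of word-id lists into the sparse one-hot Value that the model's sequence input expects.

def minibatches(data, labels, batch_size, vocab_size):
    # Shuffle once per pass, then yield (one-hot features, labels) pairs.
    indices = np.arange(len(data))
    np.random.shuffle(indices)
    for start in range(0, len(data), batch_size):
        batch = indices[start:start + batch_size]
        x_batch = C.Value.one_hot([data[i] for i in batch], vocab_size)
        y_batch = labels[batch]
        yield x_batch, y_batch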
if __name__ == '__main__':
    imdb_folder = '/path/to/imdb/folder'
    max_length = 50
    hidden_dim = 128
    learning_rate = 0.01
    batch_size = 32
    num_epochs = 10

    train_texts, train_labels, test_texts, test_labels = prepare_imdb_data(imdb_folder)
    train_data, train_labels, test_data, test_labels, word_to_idx, idx_to_word = preprocess_imdb_data(
        train_texts, train_labels, test_texts, test_labels, max_length)
    vocab_size = len(word_to_idx)

    x, y, y_pred, loss, error = create_lstm_model(vocab_size, hidden_dim, 2, max_length)
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.fsadagrad(y_pred.parameters, lr=lr_schedule, momentum=C.momentum_schedule(0.9))
    trainer = C.Trainer(y_pred, (loss, error), [learner])

    for epoch in range(num_epochs):
        epoch_loss = 0
        epoch_error = 0
        num_batches = 0
        for x_batch, y_batch in minibatches(train_data, train_labels, batch_size, vocab_size):
            trainer.train_minibatch({x: x_batch, y: y_batch})
            epoch_loss += trainer.previous_minibatch_loss_average
            epoch_error += trainer.previous_minibatch_evaluation_average
            num_batches += 1
        print("Epoch {0}: loss = {1:.4f}, error = {2:.2f}%".format(
            epoch + 1, epoch_loss / num_batches, epoch_error / num_batches * 100))

    # Evaluate on the test set; Trainer.test_minibatch returns the average
    # value of the evaluation criterion (here, the classification error).
    test_error = 0
    test_batches = 0
    for x_batch, y_batch in minibatches(test_data, test_labels, batch_size, vocab_size):
        test_error += trainer.test_minibatch({x: x_batch, y: y_batch})
        test_batches += 1
    print("Test error = {0:.2f}%".format(test_error / test_batches * 100))
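Optionally, the trained network can be saved for later scoring; Function.save serializes the whole graph (the file name is just an example):

    # Persist the trained network for later evaluation (example file name).
    y_pred.save('imdb_lstm.model')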
CNTK is a powerful deep learning framework that handles sequence classification well. This article showed how to implement a sentiment analysis task in CNTK, but CNTK goes far beyond this one task: it can also be used for image recognition, speech recognition, and much more. If you have not tried CNTK yet, why not start now?