I'm currently working through Google's text classification guide. In Step 4, they build a CNN with separable convolutions on top of a word embedding:
from tensorflow.keras import models
from tensorflow.keras.layers import (Dense, Dropout, Embedding,
                                     GlobalAveragePooling1D, MaxPooling1D,
                                     SeparableConv1D)


def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features,
                 use_pretrained_embedding=False,
                 is_embedding_trainable=False,
                 embedding_matrix=None):
    """Creates an instance of a separable CNN model.

    # Arguments
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of the layers.
        kernel_size: int, length of the convolution window.
        embedding_dim: int, dimension of the embedding vectors.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        pool_size: int, factor by which to downscale input at MaxPooling layer.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of output classes.
        num_features: int, number of words (embedding input dimension).
        use_pretrained_embedding: bool, true if pre-trained embedding is on.
        is_embedding_trainable: bool, true if embedding layer is trainable.
        embedding_matrix: np.ndarray, matrix of pre-trained embedding coefficients.

    # Returns
        A sepCNN model instance.
    """
    # _get_last_layer_units_and_activation is a helper defined elsewhere in
    # the guide's code; it returns the unit count and activation for the
    # output layer based on the number of classes.
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)
    model = models.Sequential()

    # Add embedding layer. If a pre-trained embedding is used, add its weights
    # to the embedding layer and set trainable to the is_embedding_trainable flag.
    if use_pretrained_embedding:
        model.add(Embedding(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0],
                            weights=[embedding_matrix],
                            trainable=is_embedding_trainable))
    else:
        model.add(Embedding(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0]))

    for _ in range(blocks - 1):
        model.add(Dropout(rate=dropout_rate))
        model.add(SeparableConv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  bias_initializer='random_uniform',
                                  depthwise_initializer='random_uniform',
                                  padding='same'))
        model.add(SeparableConv1D(filters=filters,
                                  kernel_size=kernel_size,
                                  activation='relu',
                                  bias_initializer='random_uniform',
                                  depthwise_initializer='random_uniform',
                                  padding='same'))
        model.add(MaxPooling1D(pool_size=pool_size))

    model.add(SeparableConv1D(filters=filters * 2,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              padding='same'))
    model.add(SeparableConv1D(filters=filters * 2,
                              kernel_size=kernel_size,
                              activation='relu',
                              bias_initializer='random_uniform',
                              depthwise_initializer='random_uniform',
                              padding='same'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(op_units, activation=op_activation))
    return model
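For context, here is a minimal sketch of how I instantiate this model; the hyperparameter values below are my guesses at the training script's defaults, not values quoted from the guide:

# Sketch only: these hyperparameters are my assumptions, roughly matching
# the defaults in the guide's training script.
model = sepcnn_model(blocks=2,
                     filters=64,
                     kernel_size=3,
                     embedding_dim=200,
                     dropout_rate=0.2,
                     pool_size=3,
                     input_shape=(500,),   # padded sequence length
                     num_classes=2,
                     num_features=20000)   # TOP_K vocabulary limit
model.summary()  # Embedding output shape: (None, 500, 200)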
However, looking at the GitHub repository, they use only a single channel of word embeddings as input:
import os

import numpy as np

# TOP_K is the vocabulary-size limit defined elsewhere in the guide's code
# (20,000 in the repository).
TOP_K = 20000


def _get_embedding_matrix(word_index, embedding_data_dir, embedding_dim):
    """Gets embedding matrix from the embedding index data.

    # Arguments
        word_index: dict, word to index map that was generated from the data.
        embedding_data_dir: string, path to the pre-trained embeddings.
        embedding_dim: int, dimension of the embedding vectors.

    # Returns
        np.ndarray, matrix of word vectors for the words in word_index,
        taken from the pre-trained embedding.

    # References:
        https://nlp.stanford.edu/projects/glove/

        Download and uncompress archive from:
        http://nlp.stanford.edu/data/glove.6B.zip
    """
    # Read the pre-trained embedding file and get word to word vector mappings.
    embedding_matrix_all = {}

    # We are using 200d GloVe embeddings.
    fname = os.path.join(embedding_data_dir, 'glove.6B.200d.txt')
    with open(fname) as f:
        for line in f:  # Every line contains a word followed by its vector.
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_matrix_all[word] = coefs

    # Prepare embedding matrix with just the words in our word_index dictionary.
    num_words = min(len(word_index) + 1, TOP_K)
    embedding_matrix = np.zeros((num_words, embedding_dim))

    for word, i in word_index.items():
        if i >= TOP_K:
            continue
        embedding_vector = embedding_matrix_all.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
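For what it's worth, here is my own quick check of what this returns (word_index is assumed to come from the Keras tokenizer used in the guide's sequence-vectorization step; the path is a placeholder):

# My own check, not from the repo: word_index and the data path are assumptions.
embedding_matrix = _get_embedding_matrix(word_index,
                                         embedding_data_dir='data/glove',
                                         embedding_dim=200)
print(embedding_matrix.shape)  # (num_words, 200): one 200-d row per word,
                               # i.e. a single 2-D table, not a stack of channels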
My question is: if the embedding layer has only a single channel, why do they use a SeparableConv1D layer as their first convolution layer? As I understand it, a separable convolution only offers a computational advantage over an ordinary convolution when it is applied to multiple channels.
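To make the comparison concrete, this is the back-of-the-envelope math I'm working from (my own sketch, not from the guide): for kernel size k, C_in input channels, and C_out filters, an ordinary Conv1D learns about k * C_in * C_out weights, while a SeparableConv1D learns k * C_in depthwise plus C_in * C_out pointwise weights, so the saving grows with the number of input channels:

# My own sketch for counting parameters; not taken from the guide.
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv1D, SeparableConv1D

def count_params(layer, channels, seq_len=500):
    # Build a one-layer model so Keras materializes the layer's weights.
    inp = Input(shape=(seq_len, channels))
    return Model(inp, layer(inp)).count_params()

for c_in in (1, 200):  # single-channel input vs. 200-d embedding output
    conv = count_params(Conv1D(filters=64, kernel_size=3), c_in)
    sep = count_params(SeparableConv1D(filters=64, kernel_size=3), c_in)
    print(f'C_in={c_in}: Conv1D={conv}, SeparableConv1D={sep}')
# C_in=1:   Conv1D=256,   SeparableConv1D=131
# C_in=200: Conv1D=38464, SeparableConv1D=13464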