I am trying to build a multilingual WSD system that uses BERT as the embedding layer. To get better performance, after BERT has done its job (and transfer learning has been performed), I need to remove the sub-word tokens from its output. Is there a way to do that?
I tried detaching the model from the rest of the network architecture and doing something like the following, but I need to do it as a custom layer, and I am not 100% sure this approach is correct:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm


class Bert:
    def __init__(self):
        input_word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
        input_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
        segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="segment_ids")
        print("downloading BERT...")
        bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1",
                              trainable=False, name="BERT")
        print("BERT downloaded")
        pooled_output, sequence_output = bert([input_word_ids, input_mask, segment_ids])
        self.model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids],
                                           outputs=[pooled_output, sequence_output])
        self.model.summary()

    def predict(self, input_word_ids, input_mask, segment_ids, positional_ids, needed_padding, train_mode: bool = False):
        print("Starting BERT prediction...")
        pool_embs, all_embs = self.model.predict(
            {'input_word_ids': input_word_ids, 'input_mask': input_mask, 'segment_ids': segment_ids},
            verbose=1,
            batch_size=64
        )
        del pool_embs  # only the per-token embeddings are needed
        to_return = []
        print("Conversion\nSoftware version 2.0...")
        for i in tqdm(range(len(positional_ids))):
            # Indices of the word-initial sub-tokens to keep, followed by padding positions
            indexes_to_extrapolate = np.concatenate((positional_ids[i], needed_padding[i]))
            indexes_to_extrapolate = indexes_to_extrapolate[:63] if len(indexes_to_extrapolate) > 64 else indexes_to_extrapolate
            new_version = tf.gather(all_embs[i], tf.constant(indexes_to_extrapolate))
            if train_mode and new_version.shape[0] < 64:
                # Means that, originally, there has to be a padding!
                # And, if there is, it can surely be found in the first position of the needed_padding!
                how_much_iteration = 64 - new_version.shape[0]
                if how_much_iteration > 0:
                    for iteratore in range(how_much_iteration):
                        tmp_padding_for_iteration = needed_padding[i][0]
                        new_version = tf.concat(
                            [new_version, tf.constant(all_embs[i][tmp_padding_for_iteration], shape=(1, 768))], 0)
            with open("registro_shape.txt", "a") as registro:
                registro.write("Shape --> " + str(new_version.shape) + "\n")
            if new_version.shape[0] > 64:
                print("wth")
            to_return.append(new_version)
        return tf.stack(to_return)
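What I think I actually need is to do this gather inside the graph as a custom layer, instead of post-processing the result of model.predict(). A minimal sketch of what I have in mind is below (the SubwordRemover name and the extra positional_ids input are just my own assumptions, not something that already works in my pipeline):

class SubwordRemover(tf.keras.layers.Layer):
    """Hypothetical custom layer: keep only the embedding of the first
    sub-token of each word, given the per-sentence word-start indices."""

    def call(self, inputs):
        sequence_output, positional_ids = inputs
        # sequence_output: (batch, seq_len, 768) token-level BERT embeddings
        # positional_ids:  (batch, n_words) int32, index of each word's first sub-token
        # batch_dims=1 gathers per sentence, so every example keeps only its own rows
        return tf.gather(sequence_output, positional_ids, batch_dims=1)

It could then be plugged in with something like word_embeddings = SubwordRemover()([sequence_output, positional_ids]), where positional_ids is one more tf.keras.layers.Input(shape=(None,), dtype=tf.int32) padded to a fixed number of words per batch, but I do not know whether this is the idiomatic way to do it.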
EDIT: I will try to give the case more context with additional information about the network architecture. In particular, this is the architecture I am trying to build for the WSD task. Note that the network is supposed to perform multi-task learning:
- BERT
- BiLSTM
- 注意力层
- 3个输出层
self.tokenizatore = FullTokenizer(bert_path, do_lower_case=False)
input_word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="segment_ids")
print("downloading BERT...")
bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1", trainable=False)
print("BERT downloaded")
pooled_output, sequence_output = bert([input_word_ids, input_mask, segment_ids])

# BiLSTM over the token embeddings; return_state=True so the final
# forward/backward hidden states are available for the attention layer
LSTM = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        units=hidden_size,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        return_sequences=True,
        return_state=True
    )
)(sequence_output)
LSTM = self.produce_attention_layer(LSTM)
LSTM = tf.keras.layers.Dropout(0.5)(LSTM)

# Three task-specific softmax heads (multi-task learning)
babelnet_output = tf.keras.layers.Dense(outputs_size[0], activation="softmax", name="babelnet")(LSTM)
domain_output = tf.keras.layers.Dense(outputs_size[1], activation="softmax", name="domain")(LSTM)
lexicon_output = tf.keras.layers.Dense(outputs_size[2], activation="softmax", name="lexicon")(LSTM)
def produce_attention_layer(self, LSTM):
    """
    Produces an attention layer like the one described in Raganato et al.,
    "Neural Sequence Learning Models for Word Sense Disambiguation", Section 3.2.
    :param LSTM: The outputs of the BiLSTM (sequence output plus the forward/backward states)
    :return: The BiLSTM sequence output re-weighted by the attention layer
    """
    # LSTM[0] = sequence output, LSTM[1] / LSTM[3] = final forward / backward hidden states
    hidden_states = tf.keras.layers.Concatenate()([LSTM[1], LSTM[3]])
    ripetitore = tf.keras.layers.RepeatVector(tf.keras.backend.shape(LSTM[0])[1])(hidden_states)
    u = tf.keras.layers.Dense(1, activation="tanh")(ripetitore)
    attivazione = tf.keras.layers.Activation('softmax')(u)  # We are using a custom softmax(axis=1) loaded in this notebook
    dotor = tf.keras.layers.Multiply()([LSTM[0], attivazione])
    return dotor
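For completeness, this is roughly where I imagine the sub-word removal would have to sit in that architecture (again, SubwordRemover and the positional_ids input are only my assumption from the sketch above, not something that currently works):

positional_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="positional_ids")

# drop the non-initial sub-tokens inside the graph, before the BiLSTM
word_level_output = SubwordRemover()([sequence_output, positional_ids])

LSTM = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(units=hidden_size, dropout=dropout,
                         recurrent_dropout=recurrent_dropout,
                         return_sequences=True, return_state=True)
)(word_level_output)

# ...attention, dropout and the three softmax heads as above, then:
model = tf.keras.models.Model(
    inputs=[input_word_ids, input_mask, segment_ids, positional_ids],
    outputs=[babelnet_output, domain_output, lexicon_output])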