I am trying to build a multilingual WSD system that uses BERT as the embedding layer. To get better performance, after BERT has done its job (and transfer learning has been performed), I need to remove the sub-word tokens from its output. Is there a way to do that?
I tried detaching the model from the rest of the network architecture and doing something like the following, but I need to do it as a custom layer, and I am not 100% sure this approach is correct:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm


class Bert:
    def __init__(self):
        input_word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
        input_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
        segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="segment_ids")
        print("downloading BERT...")
        bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1",
                              trainable=False, name="BERT")
        print("BERT downloaded")
        pooled_output, sequence_output = bert([input_word_ids, input_mask, segment_ids])
        self.model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids],
                                           outputs=[pooled_output, sequence_output])
        self.model.summary()

    def predict(self, input_word_ids, input_mask, segment_ids, positional_ids, needed_padding, train_mode: bool = False):
        print("Starting BERT prediction...")
        pool_embs, all_embs = self.model.predict(
            {'input_word_ids': input_word_ids, 'input_mask': input_mask, 'segment_ids': segment_ids},
            verbose=1,
            batch_size=64
        )
        del pool_embs  # only the per-token embeddings are needed
        to_return = []
        print("Conversion\nSoftware version 2.0...")
        for i in tqdm(range(len(positional_ids))):
            # Indices of the word-initial sub-tokens to keep, followed by padding positions
            indexes_to_extrapolate = np.concatenate((positional_ids[i], needed_padding[i]))
            indexes_to_extrapolate = indexes_to_extrapolate[:63] if len(indexes_to_extrapolate) > 64 else indexes_to_extrapolate
            new_version = tf.gather(all_embs[i], tf.constant(indexes_to_extrapolate))
            if train_mode and new_version.shape[0] < 64:
                # Means that, originally, there has to be a padding!
                # And, if there is, it can surely be found in the first position of the needed_padding!
                how_much_iteration = 64 - new_version.shape[0]
                if how_much_iteration > 0:
                    for iteratore in range(how_much_iteration):
                        tmp_padding_for_iteration = needed_padding[i][0]
                        new_version = tf.concat(
                            [new_version, tf.constant(all_embs[i][tmp_padding_for_iteration], shape=(1, 768))], 0)
            with open("registro_shape.txt", "a") as registro:
                registro.write("Shape --> " + str(new_version.shape) + "\n")
            if new_version.shape[0] > 64:
                print("wth")
            to_return.append(new_version)
        return tf.stack(to_return)
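What I think I actually need is to do this gather inside the graph as a custom layer, instead of post-processing the result of model.predict(). A minimal sketch of what I have in mind is below (the SubwordRemover name and the extra positional_ids input are just my own assumptions, not something that already works in my pipeline):

class SubwordRemover(tf.keras.layers.Layer):
    """Hypothetical custom layer: keep only the embedding of the first
    sub-token of each word, given the per-sentence word-start indices."""

    def call(self, inputs):
        sequence_output, positional_ids = inputs
        # sequence_output: (batch, seq_len, 768) token-level BERT embeddings
        # positional_ids:  (batch, n_words) int32, index of each word's first sub-token
        # batch_dims=1 gathers per sentence, so every example keeps only its own rows
        return tf.gather(sequence_output, positional_ids, batch_dims=1)

It could then be plugged in with something like word_embeddings = SubwordRemover()([sequence_output, positional_ids]), where positional_ids is one more tf.keras.layers.Input(shape=(None,), dtype=tf.int32) padded to a fixed number of words per batch, but I do not know whether this is the idiomatic way to do it.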
EDIT: I will try to give the case more context with additional information about the network architecture. In particular, this is the architecture I am trying to build for the WSD task. Note that the network is supposed to perform multi-task learning:
- BERT
- BiLSTM
- 注意力层
- 3个输出层
self.tokenizatore = FullTokenizer(bert_path, do_lower_case=False)
input_word_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="segment_ids")
print("downloading BERT...")
bert = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1", trainable=False)
print("BERT downloaded")
pooled_output, sequence_output = bert([input_word_ids, input_mask, segment_ids])

# BiLSTM over the token embeddings; return_state=True so the final
# forward/backward hidden states are available for the attention layer
LSTM = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        units=hidden_size,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        return_sequences=True,
        return_state=True
    )
)(sequence_output)
LSTM = self.produce_attention_layer(LSTM)
LSTM = tf.keras.layers.Dropout(0.5)(LSTM)

# Three task-specific softmax heads (multi-task learning)
babelnet_output = tf.keras.layers.Dense(outputs_size[0], activation="softmax", name="babelnet")(LSTM)
domain_output = tf.keras.layers.Dense(outputs_size[1], activation="softmax", name="domain")(LSTM)
lexicon_output = tf.keras.layers.Dense(outputs_size[2], activation="softmax", name="lexicon")(LSTM)
def produce_attention_layer(self, LSTM):
    """
    Produces an attention layer like the one described in Raganato et al.,
    "Neural Sequence Learning Models for Word Sense Disambiguation", Section 3.2.
    :param LSTM: The outputs of the BiLSTM (sequence output plus the forward/backward states)
    :return: The BiLSTM sequence output re-weighted by the attention layer
    """
    # LSTM[0] = sequence output, LSTM[1] / LSTM[3] = final forward / backward hidden states
    hidden_states = tf.keras.layers.Concatenate()([LSTM[1], LSTM[3]])
    ripetitore = tf.keras.layers.RepeatVector(tf.keras.backend.shape(LSTM[0])[1])(hidden_states)
    u = tf.keras.layers.Dense(1, activation="tanh")(ripetitore)
    attivazione = tf.keras.layers.Activation('softmax')(u)  # We are using a custom softmax(axis=1) loaded in this notebook
    dotor = tf.keras.layers.Multiply()([LSTM[0], attivazione])
    return dotor
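For completeness, this is roughly where I imagine the sub-word removal would have to sit in that architecture (again, SubwordRemover and the positional_ids input are only my assumption from the sketch above, not something that currently works):

positional_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="positional_ids")

# drop the non-initial sub-tokens inside the graph, before the BiLSTM
word_level_output = SubwordRemover()([sequence_output, positional_ids])

LSTM = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(units=hidden_size, dropout=dropout,
                         recurrent_dropout=recurrent_dropout,
                         return_sequences=True, return_state=True)
)(word_level_output)

# ...attention, dropout and the three softmax heads as above, then:
model = tf.keras.models.Model(
    inputs=[input_word_ids, input_mask, segment_ids, positional_ids],
    outputs=[babelnet_output, domain_output, lexicon_output])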