TensorFlow RNN network with variable sequence length?

data-mining python neural-network regression tensorflow rnn
2022-02-24 00:10:43

I am currently trying to implement an RNN network for regression, so that I can map known inputs to known outputs. My problem is that the input has no static length, since it consists of samples from audio files, and those audio files vary in length. The output, however, always has a consistent length of 14.

It is precisely this inconsistency in the input vectors that made me reach for an RNN in the first place.

I am currently using TensorFlow, and there doesn't seem to be a good way to handle this. I tried a hacky solution based on this post, but somehow ended up with the same problem I started with.

Here is my implementation:

import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import rnn
import numpy as np
import librosa
import glob
import matplotlib.pyplot as plt
import os
from os import listdir
from os.path import isfile, join
import time
rng = np.random
np.set_printoptions(threshold=np.nan)
import functools

start_time = time.time()

print "Preprocessing"

def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper

## Class definition ##
class VariableSequenceLabelling:

    def __init__(self, data, target, num_hidden=200, num_layers=3):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
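        # Touching these lazy properties forces the corresponding graph ops
        # to be built once, at construction time.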
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
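        # Infer each sequence's true length from its zero-padding:
        # a timestep counts as used if any of its features is non-zero.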
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
            rnn_cell.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        # Softmax layer.
        max_length = int(self.target.get_shape()[1])
        num_classes = int(self.target.get_shape()[2])
        weight, bias = self._weight_and_bias(self._num_hidden, num_classes)
        # Flatten to apply same weights to all time steps.
        output = tf.reshape(output, [-1, self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
        prediction = tf.reshape(prediction, [-1, max_length, num_classes])
        return prediction

    @lazy_property
    def cost(self):
        # Compute cross entropy for each frame.
        cross_entropy = self.target * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        cross_entropy *= mask
        # Average over actual sequence lengths.
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
        cross_entropy /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(cross_entropy)

    @lazy_property
    def optimize(self):
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))
        mistakes = tf.cast(mistakes, tf.float32)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        mistakes *= mask
        # Average over actual sequence lengths.
        mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
        mistakes /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(mistakes)

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)

#######################
# Converting files from .sph to .wav format... God dammit!!!

#with open(train_filelist, 'r') as train_filelist, open(test_filelist, 'r') as test_filelist:
    #train_mylist = train_filelist.read().splitlines()
    #test_mylist = test_filelist.read().splitlines()
    #for line in train_mylist:
        #new_line = ' '.join(reversed(line))
        #index_start = new_line.find('h')
        #index_end = new_line.find('/')
        #edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
        #new_file = edited_line + 'wav'
        #os.system(line + ' >> ' + dnn_train + new_file)
    #for line in test_mylist:
        #new_line = ' '.join(reversed(line))
        #index_start = new_line.find('h')
        #index_end = new_line.find('/')
        #edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
        #new_file = edited_line + 'wav'
        #os.system(line + ' >> ' + dnn_test + new_file)


path_train =  "/home/JoeS/kaldi-trunk/egs/start/s5/data/train"
path_test =  "/home/JoeS/kaldi-trunk/egs/start/s5/data/test"
dnn_train = "/home/JoeS/kaldi-trunk/dnn/train/"
dnn_test = "/home/JoeS/kaldi-trunk/dnn/test/"
dnn = "/home/JoeS/kaldi-trunk/dnn/"
path  = "/home/JoeS/kaldi-trunk/egs/start/s5/data/"
MFCC_dir = "/home/JoeS/kaldi-trunk/egs/start/s5/mfcc/raw_mfcc_train.txt"

train_filelist = path_train+"/wav_train.txt"
test_filelist = path_test+"/wav_test.txt"

os.chdir(path)

def binify(number):
    # Map a value into one of six equally sized bins.
    divider = (36471330-10533580)/6
    if number >= divider*0 and number < divider*1:
        return 1
    if number >= divider*1 and number < divider*2:
        return 2
    if number >= divider*2 and number < divider*3:
        return 3
    if number >= divider*3 and number < divider*4:
        return 4
    if number >= divider*4 and number < divider*5:
        return 5
    if number >= divider*5:
        return 6

def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches

def load_sound_files(file_paths ,  names_input, data_input):
    raw_sounds = []
    names_output = []
    data_output = []
    class_output = []
    for fp in file_paths:
        X,sr = librosa.load(fp)
        raw_sounds.append(X)
        index = list(find_all(fp,'-'))
        input_index = names_input.index(fp[index[1]+1:index[2]])
        names_output.append(names_input[input_index])
        data_output.append(data_input[input_index])
        class_output.append(binify(data_input[input_index][0]))
    return raw_sounds, names_output, data_output, class_output

def generate_list_of_names_data(file_path):
    # Preprocess: extract names and data
    name = []
    data = []
    with open(MFCC_dir) as mfcc_feature_list:
        content = [x.strip('\n') for x in mfcc_feature_list.readlines()] # remove endlines
        start_index_data = 0
        end_index_data = 2
        for number in range(0,42):
            start = list(find_all(content[start_index_data],'['))[0]
            end = list(find_all(content[end_index_data],']'))[0]
            end_name = list(find_all(content[start_index_data],' '))[0]
            substring_data = content[start_index_data][start+1 :]+content[end_index_data][: end]
            substring_name = content[start_index_data][:end_name]
            arr = np.array(substring_data.split(), dtype = float)
            data.append(arr)
            name.append(substring_name)
            start_index_data += 3
            end_index_data += 3
    return name, data

files_train_path = [dnn_train+f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_path = [dnn_test+f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]

files_train_name = [f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_name = [f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]

os.chdir(dnn_train)

train_name,train_data = generate_list_of_names_data(files_train_path)
train_data, train_names, train_output_data, train_class_output = load_sound_files(files_train_path,train_name,train_data)

max_length = 0 ## Used for variable sequence input

for element in train_data:
    if element.size > max_length:
        max_length = element.size

NUM_EXAMPLES = len(train_data)/2

test_data = train_data[NUM_EXAMPLES:]
test_output = train_output_data[NUM_EXAMPLES:]

train_data = train_data[:NUM_EXAMPLES]
train_output = train_output_data[:NUM_EXAMPLES]
print("--- %s seconds ---" % (time.time() - start_time))
##-------------------MAIN----------------------------##

if __name__ == '__main__':
    data = tf.placeholder(tf.float32, [None, max_length, 1])
    target = tf.placeholder(tf.float32, [None, 14, 1])
    model = VariableSequenceLabelling(data, target)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(10):
        for sample_set in range(100):
            batch_train = train_data[sample_set]
            batch_target = train_output[sample_set]
            sess.run(model.optimize, {data: batch_train, target: batch_target})
        test_set = test_data[epoch]
        test_set_output = test_output[epoch]
        error = sess.run(model.error, {data: test_set, target: test_set_output})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))

The error message I receive is:

Traceback (most recent call last):
  File "tensorflow_datapreprocess_mfcc_extraction_rnn.py", line 239, in <module>
    sess.run(model.optimize, {data: batch_train, target: batch_target})
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
    run_metadata_ptr)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 553, in _run
    % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (63945,) for Tensor u'Placeholder:0', which has shape '(?, 138915, 1)'

I guess the debugging is more of a Stack Overflow question than a data science one, so rather than asking for help with my code, I would like to know whether there is another framework that supports this natively, so that I don't have to hack my solution together. I have included my code so that you can see how my input and output data are structured; it would be much appreciated if I could keep that structure.

1 Answer

Since tensors can only be of a fixed size, you have to zero-pad your sequences (commonly from the left) up to the maximum occurring length, e.g.:

import numpy as np

max_len = max([len(x) for x in sequences])
sequences = [np.pad(x, (max_len - len(x), 0), 'constant', constant_values=(0., 0.)) for x in sequences]
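
To make that concrete for the code in the question, here is a minimal sketch of padding the whole training set into one 3-D batch that matches the data placeholder. It assumes train_data is a list of 1-D numpy arrays (as returned by librosa.load) and train_output is a list of length-14 target vectors, per the question; it is untested glue code, not a drop-in fix:

import numpy as np

# Assumption: train_data is a list of 1-D numpy arrays of differing lengths
# and train_output is a list of length-14 targets (names from the question).
max_len = max([len(x) for x in train_data])

# Pad with trailing zeros so the real samples stay at the front of each row.
padded = [np.pad(x, (0, max_len - len(x)), 'constant') for x in train_data]

batch_train = np.array(padded)[:, :, np.newaxis]          # (num_examples, max_len, 1)
batch_target = np.array(train_output).reshape(-1, 14, 1)  # (num_examples, 14, 1)

# These now match the placeholders [None, max_len, 1] and [None, 14, 1]:
# sess.run(model.optimize, {data: batch_train, target: batch_target})

One caveat specific to the question's graph: its length property and dynamic_rnn(sequence_length=...) only process the first length timesteps of each row, so for that code the zeros must go at the end (right-padding, as in this sketch) rather than at the left.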