我目前正在尝试实现一个用于回归的 RNN 网络,以便将已知输入映射到已知输出。我的问题是输入的长度不固定:它由音频文件的采样点组成,而各个音频文件的长度不同。但输出的长度始终一致,为 14。
正是由于输入向量的这种不一致,我决定首先使用 RNN。
我目前使用的是 TensorFlow,但它似乎没有很好的办法来处理这个问题。我尝试了基于这篇文章的变通(hack)方案,但最终还是遇到了与最初相同的问题。
这是我的实现:
import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import rnn
import numpy as np
import librosa
import glob
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import os
from os import walk
from os.path import splitext
from os.path import join
import time
import functools
import sys

# Shorthand for NumPy's random module.
rng = np.random
# Print arrays in full instead of summarising them. NumPy documents
# sys.maxsize for this purpose; recent releases reject a NaN threshold.
np.set_printoptions(threshold=sys.maxsize)
# Wall-clock reference for the elapsed-time report printed after preprocessing.
start_time = time.time()
# Call form prints the same text on Python 2 and Python 3.
print("Preprocessing")
def lazy_property(function):
    """Turn a zero-argument method into a read-only property computed once.

    The first access calls ``function(self)`` and stores the result on the
    instance under ``'_' + function.__name__``; later accesses return the
    stored value without calling ``function`` again.
    """
    cache_name = '_' + function.__name__

    def getter(self):
        already_built = hasattr(self, cache_name)
        if not already_built:
            setattr(self, cache_name, function(self))
        return getattr(self, cache_name)

    # Keep the wrapped method's name and docstring on the property getter.
    return property(functools.wraps(function)(getter))
## Class definition ##
class VariableSequenceLabelling:
    """Per-frame sequence classifier built from a GRU and a softmax layer.

    ``data`` and ``target`` are the input and label tensors; both are reduced
    along axis 2 and then axis 1 below, so they are assumed to be laid out as
    (batch, time, features/classes) -- TODO confirm against the caller.
    All-zero frames are treated as padding.
    """

    def __init__(self, data, target, num_hidden=200, num_layers=3):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        # Stored but not read anywhere else in this class.
        self._num_layers = num_layers
        # Touch the lazy properties so their graph ops are created now
        # (``optimize`` in turn evaluates ``cost``).
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        """Number of frames per sequence that contain a non-zero value."""
        # 1.0 for frames with any non-zero entry, 0.0 for all-zero frames.
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        """Softmax output for every frame, reshaped to the target's layout."""
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
            rnn_cell.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        # Softmax layer.
        # Both sizes come from the *target* tensor's static shape.
        # NOTE(review): the reshape below therefore presumes the RNN output
        # has as many time steps as the target -- confirm for inputs whose
        # length differs from the target length.
        max_length = int(self.target.get_shape()[1])
        num_classes = int(self.target.get_shape()[2])
        weight, bias = self._weight_and_bias(self._num_hidden, num_classes)
        # Flatten to apply same weights to all time steps.
        output = tf.reshape(output, [-1, self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
        prediction = tf.reshape(prediction, [-1, max_length, num_classes])
        return prediction

    @lazy_property
    def cost(self):
        """Masked cross entropy, averaged per sequence and then over the batch."""
        # Compute cross entropy for each frame.
        cross_entropy = self.target * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
        # Zero out frames whose target vector is all zeros.
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        cross_entropy *= mask
        # Average over actual sequence lengths.
        # The divisor is ``length``, which is derived from ``data``, while the
        # mask above is derived from ``target``.
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
        cross_entropy /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(cross_entropy)

    @lazy_property
    def optimize(self):
        """Training op: one Adam step minimising ``cost``."""
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        """Masked fraction of frames whose arg-max class differs from the target."""
        mistakes = tf.not_equal(
            tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))
        mistakes = tf.cast(mistakes, tf.float32)
        # Ignore frames whose target vector is all zeros.
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        mistakes *= mask
        # Average over actual sequence lengths.
        mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
        mistakes /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(mistakes)

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        """Create the output layer's variables.

        Returns a weight variable of shape [in_size, out_size] initialised
        from a truncated normal (stddev 0.01) and a bias variable of shape
        [out_size] initialised to 0.1.
        """
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)
#######################
# Convert files from .sph to .wav format (disabled; the commented-out code below is kept for reference).
#with open(train_filelist, 'r') as train_filelist, open(test_filelist, 'r') as test_filelist:
#train_mylist = train_filelist.read().splitlines()
#test_mylist = test_filelist.read().splitlines()
#for line in train_mylist:
#new_line = ' '.join(reversed(line))
#index_start = new_line.find('h')
#index_end = new_line.find('/')
#edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#new_file = edited_line + 'wav'
#os.system(line + ' >> ' + dnn_train + new_file)
#for line in test_mylist:
#new_line = ' '.join(reversed(line))
#index_start = new_line.find('h')
#index_end = new_line.find('/')
#edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#new_file = edited_line + 'wav'
#os.system(line + ' >> ' + dnn_test + new_file)
# Root data directory and its train/test subdirectories.
path = "/home/JoeS/kaldi-trunk/egs/start/s5/data/"
path_train = "/home/JoeS/kaldi-trunk/egs/start/s5/data/train"
path_test = "/home/JoeS/kaldi-trunk/egs/start/s5/data/test"

# Directories scanned later for the audio files to load.
dnn = "/home/JoeS/kaldi-trunk/dnn/"
dnn_train = "/home/JoeS/kaldi-trunk/dnn/train/"
dnn_test = "/home/JoeS/kaldi-trunk/dnn/test/"

# Text file opened by generate_list_of_names_data (despite the name, a file).
MFCC_dir = "/home/JoeS/kaldi-trunk/egs/start/s5/mfcc/raw_mfcc_train.txt"

# File lists referenced by the commented-out conversion code above.
train_filelist = "".join((path_train, "/wav_train.txt"))
test_filelist = "".join((path_test, "/wav_test.txt"))

# Work from the data root from here on.
os.chdir(path)
def binify(number):
    """Map a non-negative value to one of six equal-width bins, numbered 1-6.

    The bin width is ``(36471330 - 10533580) // 6``. Bin ``k`` (1-5) covers
    ``[(k - 1) * width, k * width)``; bin 6 is open-ended and covers
    everything from ``5 * width`` upwards.

    The previous if-chain skipped the range ``[4 * width, 5 * width)``
    entirely, so values there fell through and returned None.

    Args:
        number: the value to classify (int or float).

    Returns:
        The bin number as an int in 1..6, or None when ``number`` is
        negative or NaN (unchanged from the original fall-through).
    """
    num_bins = 6
    # Floor division keeps the integer bin width on both Python 2 and 3.
    divider = (36471330 - 10533580) // num_bins
    # Written as ``not >=`` so that NaN is rejected together with negatives.
    if not number >= 0:
        return None
    # Handle the open-ended top bin first; this also covers infinity.
    if number >= divider * (num_bins - 1):
        return num_bins
    return int(number // divider) + 1
def find_all(a_str, sub):
    """Yield the start index of each non-overlapping match of ``sub`` in ``a_str``.

    Matches are reported left to right. After a match the search resumes
    ``len(sub)`` characters later, so overlapping occurrences are skipped
    (step by 1 instead to find overlapping matches).

    An empty ``sub`` matches at every position ``0..len(a_str)``, in line
    with ``str.count``. Previously the cursor never advanced for an empty
    ``sub`` and the generator yielded 0 forever.

    Args:
        a_str: the string to search.
        sub: the substring to look for.

    Yields:
        int: index in ``a_str`` where a match starts.
    """
    # Always move forward by at least one character so the scan terminates.
    step = max(len(sub), 1)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
def load_sound_files(file_paths , names_input, data_input):
    """Load each audio file and pair it with its entry in ``names_input``.

    For every path the audio is read with ``librosa.load``. The text between
    the second and third '-' of the path is looked up in ``names_input``,
    and the entry at the same position in ``data_input`` is collected along
    with the bin (see ``binify``) of that entry's first value.

    Args:
        file_paths: iterable of audio file paths.
        names_input: list of names; must contain the key taken from each path.
        data_input: sequence parallel to ``names_input``.

    Returns:
        Four lists, in the order of ``file_paths``: loaded audio, matched
        names, matched data entries, and bin numbers.
    """
    raw_sounds, names_output, data_output, class_output = [], [], [], []
    for fp in file_paths:
        # The sample rate returned by librosa is not used.
        samples, _sample_rate = librosa.load(fp)
        raw_sounds.append(samples)

        dash_positions = list(find_all(fp, '-'))
        key = fp[dash_positions[1] + 1:dash_positions[2]]
        match = names_input.index(key)

        names_output.append(names_input[match])
        data_output.append(data_input[match])
        class_output.append(binify(data_input[match][0]))
    return raw_sounds, names_output, data_output, class_output
def generate_list_of_names_data(file_path, num_records=42, mfcc_path=None):
    """Parse names and feature values from the MFCC text file.

    The file is read as consecutive records of three lines each. For every
    record, the name is the text before the first space on the first line.
    The data is whatever follows the first '[' on the first line plus
    whatever precedes the first ']' on the third line, split on whitespace
    and converted to floats. The middle line of each record is not read.

    Args:
        file_path: unused; kept so existing callers keep working.
        num_records: number of three-line records to read (default 42, the
            value that used to be hard-coded).
        mfcc_path: file to read; defaults to the module-level ``MFCC_dir``.

    Returns:
        (name, data): a list of record names and a parallel list of 1-D
        float NumPy arrays.

    Raises:
        IndexError: if the file has fewer than ``3 * num_records`` lines.
        ValueError: if an expected '[', ']' or space is missing, or a value
            cannot be converted to float.
    """
    lines_per_record = 3
    if mfcc_path is None:
        mfcc_path = MFCC_dir
    with open(mfcc_path) as mfcc_feature_list:
        # Remove line endings only; other whitespace is handled by split().
        content = [line.strip('\n') for line in mfcc_feature_list]

    name = []
    data = []
    for record in range(num_records):
        first_line = content[record * lines_per_record]
        last_line = content[record * lines_per_record + 2]
        values_head = first_line[first_line.index('[') + 1:]
        values_tail = last_line[:last_line.index(']')]
        data.append(np.array((values_head + values_tail).split(), dtype=float))
        name.append(first_line[:first_line.index(' ')])
    return name, data
# List the regular files in each audio directory once, then derive the full
# paths from the names (dnn_train / dnn_test already end with a slash).
files_train_name = [f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_name = [f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
files_train_path = [dnn_train + f for f in files_train_name]
files_test_path = [dnn_test + f for f in files_test_name]

os.chdir(dnn_train)
train_name, train_data = generate_list_of_names_data(files_train_path)
train_data, train_names, train_output_data, train_class_output = load_sound_files(
    files_train_path, train_name, train_data)

## Used for variable sequence input
# Longest loaded clip, in samples; 0 when nothing was loaded.
max_length = max([0] + [element.size for element in train_data])

# Hold out the second half of the loaded data for testing. Floor division
# keeps NUM_EXAMPLES an int (usable as a slice index) on Python 2 and 3.
NUM_EXAMPLES = len(train_data) // 2
test_data = train_data[NUM_EXAMPLES:]
test_output = train_output_data[NUM_EXAMPLES:]
train_data = train_data[:NUM_EXAMPLES]
train_output = train_output_data[:NUM_EXAMPLES]
print("--- %s seconds ---" % (time.time() - start_time))
##-------------------MAIN----------------------------##
def _as_single_batch(values, length):
    """Return ``values`` as a zero-padded float32 array of shape (1, length, 1).

    The placeholders below are rank 3, while each sample and target is a
    flat array; feeding the flat array directly raises
    ``ValueError: Cannot feed value of shape (N,) ...``. Trailing zeros are
    used as padding because the model counts only non-zero frames.

    Raises:
        ValueError: if ``values`` holds more than ``length`` elements.
    """
    flat = np.asarray(values, dtype=np.float32).reshape(-1)
    if flat.size > length:
        raise ValueError(
            "sequence of %d values does not fit length %d" % (flat.size, length))
    batch = np.zeros((1, length, 1), dtype=np.float32)
    batch[0, :flat.size, 0] = flat
    return batch


if __name__ == '__main__':
    target_length = 14
    data = tf.placeholder(tf.float32, [None, max_length, 1])
    target = tf.placeholder(tf.float32, [None, target_length, 1])
    # NOTE(review): this fixes the feed shapes only. The model's prediction
    # reshapes the RNN output using the target's time dimension, so it may
    # still fail when input and target lengths differ -- confirm.
    model = VariableSequenceLabelling(data, target)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    # At most 100 steps per epoch, but never index past the training set.
    steps_per_epoch = min(100, len(train_data))
    for epoch in range(10):
        for sample_set in range(steps_per_epoch):
            batch_train = _as_single_batch(train_data[sample_set], max_length)
            batch_target = _as_single_batch(train_output[sample_set], target_length)
            sess.run(model.optimize, {data: batch_train, target: batch_target})
        # Evaluate on one held-out example per epoch, wrapping around if
        # there are fewer test examples than epochs.
        test_index = epoch % len(test_data)
        test_set = _as_single_batch(test_data[test_index], max_length)
        test_set_output = _as_single_batch(test_output[test_index], target_length)
        error = sess.run(model.error, {data: test_set, target: test_set_output})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
我收到的错误消息是
Traceback (most recent call last):
File "tensorflow_datapreprocess_mfcc_extraction_rnn.py", line 239, in <module>
sess.run(model.optimize, {data: batch_train, target: batch_target})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 553, in _run
% (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (63945,) for Tensor u'Placeholder:0', which has shape '(?, 138915, 1)'
我想调试代码更像是 Stack Overflow 上的问题,而不是数据科学问题,所以我并不是想请人帮我解决代码中的错误;我想知道的是,是否有其他框架原生支持可变长度的输入,这样我就不必用变通办法来拼凑解决方案。我附上了代码,以便你了解我的输入和输出数据的结构。如果能保留这种结构,我将不胜感激。