我已经能够在训练数据与验证数据合并后的数据集上运行我的代码,但现在我需要生成一个包含测试数据预测结果的文本文件,却不知道该怎么做。有没有办法把 train_data 用作 X_train、把 test_data 用作 X_test?我认为这样可以解决我的问题,但我不知道该如何实现,也不确定这是否可行。
# Text-classification script: load the tab-separated sample/label files, fit a
# bag-of-words Naive Bayes model on train + validation data, report accuracy on
# a held-out split, then write predictions for the unlabeled test samples to
# 'test_predictions.txt'.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Sample files are read as (id, text) records. comments=None stops genfromtxt
# from treating a '#' inside a text as the start of a comment.
train_data = np.genfromtxt('train_samples.txt', delimiter='\t', dtype=None, encoding='utf-8',
                           names=('id', 'text'), comments=None)
train_labels = np.genfromtxt('train_labels.txt', delimiter='\t', dtype=None, names=('id', 'label'))
test_data = np.genfromtxt('test_samples.txt', delimiter='\t', dtype=None, encoding='utf-8',
                          names=('id', 'text'), comments=None)
validation_data = np.genfromtxt('validation_samples.txt', delimiter='\t', dtype=None, encoding='utf-8',
                                names=('id', 'text'), comments=None)
validation_labels = np.genfromtxt('validation_labels.txt', delimiter='\t', dtype=None, names=('id', 'label'))

# Combine train and validation into one labeled set. Labels are paired with
# samples by row position, so this assumes each labels file lists its rows in
# the same order as the matching samples file -- TODO confirm.
# The sample ids are left intact instead of being overwritten with labels.
train_data_text = np.concatenate((train_data['text'], validation_data['text']))
train_data_labels = np.concatenate((train_labels['label'], validation_labels['label']))

# Fit the vocabulary once, on the labeled texts only. The matrix stays sparse:
# MultinomialNB and train_test_split both accept sparse input, so densifying
# with .toarray() only costs memory.
cv = CountVectorizer()
X = cv.fit_transform(train_data_text)
# show shape of training data
print(X.shape)

# Use the labels as they appear in the files so that predictions come out as
# real label values (and more than two classes would work as well).
y = train_data_labels

# Hold out 20% of the labeled data to estimate accuracy.
# NOTE: X_test / y_test here are this held-out split, NOT the unlabeled
# samples from 'test_samples.txt'.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Training model using Naive bayes classifier
results = MultinomialNB().fit(X_train, y_train)
y_pred = results.predict(X_test)
print(y_pred)

# Evaluate accuracy
print(accuracy_score(y_test, y_pred))

# Final model: refit on all labeled data. The unlabeled test texts are encoded
# with transform() (not fit_transform()) so they use the vocabulary learned
# above and have the same feature columns the classifier was trained on.
classifier = MultinomialNB()
classifier.fit(X, y)
test_features = cv.transform(test_data['text'])
test_pred = classifier.predict(test_features)

# Write one "<id>\t<label>" line per test sample, mirroring the tab-separated
# (id, label) layout the label files are read with.
with open('test_predictions.txt', 'w', encoding='utf-8') as predictions_file:
    predictions_file.writelines(
        '{}\t{}\n'.format(sample_id, label)
        for sample_id, label in zip(test_data['id'], test_pred)
    )