I am running a regression analysis on some data. I keep getting a very high training score and a low test score. My code is below; what can I do to improve it? Thanks in advance.
# coding: utf-8
# In[1]:
#Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; use model_selection
# In[2]:
#Importing data
df = np.genfromtxt('/Users/Studies/Machine_learning/reactivity/main_us.csv', delimiter=',')
#To skip the header, add skip_header=1
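# If the CSV's first row is a header (an assumption about main_us.csv), the same
# call with the header skipped would be:
#     df = np.genfromtxt('/Users/Studies/Machine_learning/reactivity/main_us.csv', delimiter=',', skip_header=1)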
# In[3]:
X = df[:, 1:306]
y = df[:, 0]
# In[4]:
print(X.shape)
print(y.shape)
display(X)
display(y)
# In[5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=4)
# In[6]:
#Apply StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)  # fit the scaler on the training set only
X_test = sc.transform(X_test)        # reuse the training-set statistics on the test set
print(len(X_test), len(y_test))
# In[7]:
#Applying PCA for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA()  # no n_components given, so all components are kept (no reduction yet)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
#Checking shape after PCA
print("Checking shape after PCA")
print(X_train.shape)
print(X_test.shape)
#Variance/Values
print("Explained_variance_ratio")
print(pca.explained_variance_ratio_)
print("Singular_values")
print(pca.singular_values_)
#Plotting
print ("Graph")
plt.scatter (X_train[:,0], X_train[:,1], c=y_train, edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('rainbow',6))
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar();
print('You are looking at high-dimensional data projected onto 2 components')
print('Even though these components hold some information, it is not enough to separate the points apart')
#Checking shape after PCA
print(X_train.shape)
print(y_train.shape)
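# A quick check that often helps here (a sketch using the pca fitted above; the
# 0.95 threshold is an illustrative assumption): how many components are needed
# to explain 95% of the variance?
cum_var = np.cumsum(pca.explained_variance_ratio_)
print('components for 95% variance:', np.argmax(cum_var >= 0.95) + 1)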
# In[8]:
alphas = 10**np.linspace(10, -2, 100)*0.5  # grid from 5e9 down to 5e-3
alphas
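# Note: this grid is redefined before it is ever used below; if the intent was to
# search it, a minimal sketch (assuming the X_train/y_train from above) would be:
#     print(RidgeCV(alphas=alphas).fit(X_train, y_train).alpha_)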
# In[9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso
for Model in [Ridge, Lasso]:
    model = Model()
    # note: this cross-validates on the raw X and y, not the scaled/PCA-transformed training data
    print('%s: %s' % (Model.__name__,
                      cross_val_score(model, X, y).mean()))
# Out[9]:
Ridge: -1.3841312374053019
Lasso: -1.164517926682712
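# For context: cross_val_score uses a regressor's default scorer, R^2, so a
# negative mean means the model predicts worse than a constant baseline at the
# mean of y. The equivalent explicit call (same X and y as above):
#     print(cross_val_score(Ridge(), X, y, scoring='r2').mean())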
# In[10]:
alphas = np.logspace(-3, -1, 30)
plt.figure(figsize=(5, 3))
for Model in [Lasso, Ridge]:
    scores = [cross_val_score(Model(alpha=alpha), X, y, cv=3).mean()
              for alpha in alphas]
    plt.plot(alphas, scores, label=Model.__name__)
plt.legend(loc='lower left')
plt.xlabel('alpha')
plt.ylabel('cross validation score')
plt.tight_layout()
plt.show()
# In[11]:
# alpha = 0.1
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
# alpha = 0.01
model1 = Ridge(alpha=0.01)
model1.fit(X_train, y_train)
print(model1.score(X_train, y_train))
print(model1.score(X_test, y_test))
# alpha = 0.001
model2 = Ridge(alpha=0.001)
model2.fit(X_train, y_train)
print(model2.score(X_train, y_train))
print(model2.score(X_test, y_test))
# alpha = 0.0001
model3 = Ridge(alpha=0.0001)
model3.fit(X_train, y_train)
print(model3.score(X_train, y_train))
print(model3.score(X_test, y_test))
# (Out[11] below shows four identical score pairs because this cell originally
# refit and scored the same `model` in every block)
# Out[11]:
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
# In[12]:
modelCV = RidgeCV(alphas=[0.1, 0.01, 0.001, 0.0001], store_cv_values=True)
modelCV.fit(X_train, y_train)
modelCV.alpha_  # giving 0.1
print(modelCV.score(X_train, y_train))  # the same score as ridge regression with alpha = 0.1
print(modelCV.score(X_test, y_test))
# Out[12]:
0.9999996833724951
-0.41203227638984496
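# In[13]:
# A sketch of one way to tighten this up (illustrative, not part of the original
# run): keep the scaler, PCA, and ridge inside a single Pipeline so every
# preprocessing step is re-fit inside each cross-validation fold, and cap PCA at
# the components needed for 95% of the variance. The 0.95 threshold and the
# alpha grid are assumptions, not tuned values.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # keep components explaining 95% of the variance
    ('ridge', RidgeCV(alphas=np.logspace(-3, 3, 13))),
])
print(cross_val_score(pipe, X, y, cv=5).mean())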