使用 GridSearchCV 进行网格搜索后,我的 xgboost 模型精度降低

数据挖掘 xgboost 网格搜索
2022-02-25 00:47:02

我尝试在 XGBoost 分类器中进行网格搜索以进行超参数调整,但最佳准确度低于没有任何调整的准确度

# Baseline: XGBoost classifier trained before any grid search; only the
# objective and the seed are set explicitly.
xg_cl = xgb.XGBClassifier(objective='binary:logistic', seed = 22)
xg_cl.fit(x_train, y_train)
y_pred = xg_cl.predict(x_test)
# Score the baseline on the held-out test split.
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
[[11  1]
[ 1 26]]
0.9487179487179487

另外,值得一提的是,PCA 之后的数据集形状是 (195, 11),我正在尝试对患者是否患有帕金森病进行分类。

# Exhaustive grid search over XGBoost hyperparameters, scored by accuracy
# with 4-fold cross-validation on the training split.
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
# NOTE: range/np.arange exclude their stop value, so colsample_bytree and
# reg_lambda both stay below 1.0 here. XGBoost documents 1 as the default for
# both, which means the default configuration is not part of this grid.
params__grid = {
    'n_estimators' : range(50,150,10),
    'max_depth': range(2, 12),
    'colsample_bytree': np.arange(0.5,1,0.1),
    'reg_alpha' : np.arange(0,0.6,0.1),
    'reg_lambda' : np.arange(0,0.8,0.1)
}
search = GridSearchCV(estimator=clf_xgb, param_grid=params__grid, scoring = 'accuracy',
                            cv = 4 )
search.fit(x_train,y_train)

# best_score_ is the mean cross-validated accuracy of the best parameter set.
# '\n' (not '/n') is the newline escape.
print('best score:\n',search.best_score_)
print('bestparams:\n' ,search.best_params_)

best score:
0.9038461538461539

best params:
{'colsample_bytree': 0.5,
     'max_depth': 7,
     'n_estimators': 50,
     'reg_alpha': 0.2,
     'reg_lambda': 0.1}

然后我使用这些参数来构建和训练一个新的分类器

# Rebuild a classifier from the parameters reported by the grid search and
# score it on the held-out test split.
clf_xgb_1 = xgb.XGBClassifier(objective = 'binary:logistic', max_depth = 7, n_estimators = 50, reg_alpha = 0.2 ,
                              reg_lambda = 0.1, colsample_bytree = 0.5 )
clf_xgb_1.fit(x_train,y_train)
y_pred_2 = clf_xgb_1.predict(x_test)
# '\n' (not '/n') is the newline escape.
print('accuracy:\n', metrics.accuracy_score(y_test,y_pred_2))
print('confusion matrix:\n', metrics.confusion_matrix(y_test,y_pred_2))

accuracy:
0.8974358974358975

confusion matrix:
array([[ 9,  3],
       [ 1, 26]], dtype=int64)

为什么我的结果更糟?我希望 GridSearch 会改进结果。

1个回答

我认为有几点需要考虑:

  • 首先,在这种情况下,默认的 XGBoost 超参数可能比您通过 params__grid 传入的所有组合都更好,您可以验证这一点(例如,您的网格中 colsample_bytree 和 reg_lambda 的取值都小于 1,而 1 正是这两个参数的默认值,因此默认配置并不在搜索范围之内)

  • 尽管这并不能解释您的情况,但请记住,GridSearchCV 对象给出的 best_score_ 是 best_estimator_ 的平均交叉验证分数(source),即在 k 折上进行 k 次训练所得测试分数的平均值,因此它比单次训练/测试划分得到的分数更可靠(cv_results_ 中还包含这 k 个分数的标准差)

还要考虑到您不需要使用网格搜索 CV 得到的最佳参数重新训练模型,因为您可以通过 search.best_estimator_ 直接访问最佳模型(在默认的 refit=True 下,它已经用最佳参数在整个训练集上重新拟合过)

最后,我还推荐其他超参数调整方法,例如贝叶斯优化。下面是使用 hyperopt 的示例代码要点(有关贝叶斯优化的更多信息见此处):

from hyperopt import hp
from sklearn.model_selection import StratifiedKFold

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

def return_model_scores(params, dataX, dataY, n_folds=5):
    """Return the mean of the per-fold cost scores for one candidate.

    Args:
        params: hyperparameter candidate proposed by the optimiser.
        dataX: training features, forwarded to ``evaluate_model_cost``.
        dataY: training labels, forwarded to ``evaluate_model_cost``.
        n_folds: requested number of folds.

    Returns:
        The arithmetic mean of the scores returned by ``evaluate_model_cost``.

    NOTE(review): ``params`` and ``n_folds`` are accepted but never forwarded,
    so every candidate is evaluated the same way. Presumably the model should
    be built from ``params`` before evaluation -- confirm and wire it through.
    """
    import numpy as np

    # The original split this assignment across two lines without a
    # continuation, which is a syntax error; keep it on one logical line.
    _model, edp_cost_scores, _histories = evaluate_model_cost(
        dataX=dataX, dataY=dataY
    )

    return np.array(edp_cost_scores).mean()


# Search space handed to hyperopt's fmin: one sampling expression per
# hyperparameter name.
# NOTE: for hp.choice entries, fmin reports the index of the selected option
# rather than the option itself; hyperopt.space_eval maps it back to a value.
param_space = {
    'max_depth': hp.choice('max_depth', range(5, 30)),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators': hp.choice('n_estimators', range(20, 205, 5)),
    'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.2, 0.8),
}

# Training data shared with the objective below; both must cover the same rows.
dataX = X_train
dataY = y_train
assert len(X_train)==len(y_train)
n_folds=5

# Lowest cost seen so far; None until the first evaluation has finished.
# Kept under its own name so it does not collide with the fmin result `best`.
best_cost = None


def f(params):
    """Hyperopt objective: score one candidate and report it as the loss.

    Args:
        params: one sample drawn from ``param_space``.

    Returns:
        A dict with the cost under ``'loss'`` and ``STATUS_OK`` under
        ``'status'``, as expected by ``fmin``.
    """
    # `global` must be declared inside the function; a module-level `global`
    # statement has no effect, and assigning without it creates a local.
    global best_cost

    cost = return_model_scores(params, dataX, dataY, n_folds=n_folds)
    if best_cost is None or cost < best_cost:
        best_cost = cost
        print('new best:', best_cost, params)

    return {'loss': cost, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=10, trials=trials)

print ('best:')
print (best)

其中evaluate_model_cost是您为在评估后获取成本值而构建的函数:

def evaluate_model_cost(dataX, dataY):
    """Cross-validate ``defined_model`` and collect one cost score per fold.

    Depends on names that must exist in the surrounding script:
    ``defined_model``, ``features_to_train_on``, ``days_until_slag``,
    ``costs_dict``, ``return_true_positive_savings``,
    ``final_cost_custom_function`` and ``pd`` (pandas).

    Args:
        dataX: features; rows are selected with ``.iloc``.
        dataY: labels; rows are selected with ``.iloc``.

    Returns:
        A tuple ``(defined_model, scores, histories)`` where ``scores`` and
        ``histories`` hold one entry per fold.
    """
    from tqdm import tqdm
    from sklearn.model_selection import KFold

    scores, histories = list(), list()
    # prepare cross validation
    kfold = KFold(10, shuffle=True, random_state=1)
    # enumerate splits; everything below runs once per fold so that each fold
    # contributes its own score (the original evaluated only after the loop)
    for k, (train_ix, test_ix) in enumerate(tqdm(kfold.split(dataX))):
        print('kfold {}'.format(k))

        # select rows for train and test
        trainX, trainY = dataX.iloc[train_ix], dataY.iloc[train_ix]
        testX, testY = dataX.iloc[test_ix], dataY.iloc[test_ix]

        # fit model
        history = defined_model.fit(trainX, trainY,
                                    eval_metric='auc',
                                    eval_set=[(testX, testY)])
        # evaluate model
        y_preds = defined_model.predict(testX[features_to_train_on])

        y_true_values = testY
        # NOTE(review): days_until_slag is not defined in this function;
        # presumably it should be restricted to the rows of this fold.
        true_predicted_tuples = pd.DataFrame({'y_true': y_true_values, 'y_predicted': y_preds, 'days_till_slag': days_until_slag})
        true_predicted_tuples = true_predicted_tuples.reset_index(drop=True)

        # Savings are credited for the first true positive only, so stop
        # scanning as soon as one is found.
        tp_savings = 0
        for index in true_predicted_tuples.index:
            row = true_predicted_tuples.iloc[index]
            # TRUE POSITIVE condition:
            if (row['y_true'] == 1) & (row['y_predicted'] == 1):
                tp_savings += return_true_positive_savings(row['days_till_slag'])
                break

        fp_number = len(true_predicted_tuples[(true_predicted_tuples.y_true==0)&(true_predicted_tuples.y_predicted==1)])
        fn_number = len(true_predicted_tuples[(true_predicted_tuples.y_true==1)&(true_predicted_tuples.y_predicted==0)])

        # Parenthesised so the expression can span several lines.
        final_cost = (
            (costs_dict['fp_cost'] * fp_number)
            + (costs_dict['fn_cost'] * fn_number)
            - tp_savings
        )

        # Placeholder: substitute the cost you want to minimise here (for
        # example the final_cost computed just above).
        score = final_cost_custom_function

        print('score en evaluate_model_with_slag_days', score)

        # append scores
        scores.append(score)
        histories.append(history)

    return defined_model, scores, histories

其中final_cost_custom_function是您要最小化的成本自定义函数。这可能有点复杂,如果您想要一个非常简单的贝叶斯优化示例,请从以下内容开始:

import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def objective(x):
    """Toy objective for hyperopt: minimise ``x ** 2``.

    Returns a result dict carrying the loss and status that ``fmin`` needs,
    plus extra keys showing how additional data can travel with a trial.
    """
    loss = x ** 2
    evaluated_at = time.time()
    # -- arbitrary extra results can be stored alongside the loss
    extras = {'type': None, 'value': [0, 1, 2]}
    # -- attachments are handled differently
    attachments = {'time_module': pickle.dumps(time.time)}
    return dict(
        loss=loss,
        status=STATUS_OK,
        eval_time=evaluated_at,
        other_stuff=extras,
        attachments=attachments,
    )
# Run the same one-dimensional search twice, with 100 and then 1000
# evaluations, each time with a fresh Trials store, and print the result.
for budget, label in ((100, 'with 100 trials: '), (1000, 'with 1000 trials: ')):
    trials = Trials()
    best = fmin(
        objective,
        space=hp.uniform('x', -3, 3),
        algo=tpe.suggest,
        max_evals=budget,
        trials=trials,
    )

    print(label, best)