数据挖掘 - 使用进行网格搜索后，我的 xgboost 模型精度降低 - 吾爱随笔录

进行网格搜索后，我的 xgboost 模型精度降低

数据挖掘 xgboost 网格搜索

2022-02-25 00:47:02

我尝试在 XGBoost 分类器中进行网格搜索以进行超参数调整，但最佳准确度低于没有任何调整的准确度

# Baseline: the classifier with default hyperparameters, before the grid search.
xg_cl = xgb.XGBClassifier(objective='binary:logistic', seed=22)
xg_cl.fit(x_train, y_train)
y_pred = xg_cl.predict(x_test)
# Report the confusion matrix and accuracy on the held-out test split.
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
[[11  1]
[ 1 26]]
0.9487179487179487

另外，值得一提的是，PCA 之后的数据集形状是 (195, 11)，我正在尝试对患者是否患有帕金森病进行分类。

# Grid search over XGBoost hyperparameters, scored by accuracy with 4-fold CV.
# NOTE(review): unlike the baseline classifier, no seed is passed here, so the
# two experiments are not seeded identically.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic')
params__grid = {
    'n_estimators': range(50, 150, 10),
    'max_depth': range(2, 12),
    'colsample_bytree': np.arange(0.5, 1, 0.1),
    'reg_alpha': np.arange(0, 0.6, 0.1),
    'reg_lambda': np.arange(0, 0.8, 0.1),
}
search = GridSearchCV(estimator=clf_xgb, param_grid=params__grid,
                      scoring='accuracy', cv=4)
search.fit(x_train, y_train)

# best_score_ is a cross-validated score computed on x_train only; it is not
# directly comparable to an accuracy measured on x_test.
print('best score:\n', search.best_score_)
print('bestparams:\n', search.best_params_)

best score:
0.9038461538461539

best params:
{'colsample_bytree': 0.5,
     'max_depth': 7,
     'n_estimators': 50,
     'reg_alpha': 0.2,
     'reg_lambda': 0.1}

然后我使用这些参数来构建和训练一个新的分类器

# Rebuild a classifier from the parameters reported by the grid search and
# score it on the held-out test split.
clf_xgb_1 = xgb.XGBClassifier(objective='binary:logistic', max_depth=7,
                              n_estimators=50, reg_alpha=0.2,
                              reg_lambda=0.1, colsample_bytree=0.5)
clf_xgb_1.fit(x_train, y_train)
y_pred_2 = clf_xgb_1.predict(x_test)
print('accuracy:\n', metrics.accuracy_score(y_test, y_pred_2))
print('confusion matrix:\n', metrics.confusion_matrix(y_test, y_pred_2))

accuracy:
0.8974358974358975

confusion matrix:
array([[ 9,  3],
       [ 1, 26]], dtype=int64)

为什么我的结果更糟？我希望 GridSearch 会改进结果。

1个回答

我认为有几点需要考虑：

首先，在这种情况下，默认的 XGBoost 超参数可能比您通过 params__grid 传入的任何组合都更好（例如默认的 colsample_bytree=1 和 reg_lambda=1 都不在您的网格范围内），您可以验证这一点
尽管这并不能解释您的情况，但请记住，GridSearchCV 对象给出的 best_score_ 是 best_estimator_ 的平均交叉验证分数（source），即在 k 折上训练 k 次所得测试分数的平均值，因此它可能是一个更可靠的分值（cv_results_ 中还包含这 k 个分数的标准差）

还要注意，您不需要用网格搜索得到的最佳参数重新训练模型，因为在默认的 refit=True 下，可以通过 search.best_estimator_ 直接访问已在整个训练集上重新拟合的最佳模型

最后，我还推荐尝试其他超参数调整方法，例如贝叶斯优化；下面是使用 hyperopt 的示例代码要点（有关贝叶斯优化的更多信息见 here）：

from hyperopt import hp
from sklearn.model_selection import StratifiedKFold

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

def return_model_scores(params, dataX, dataY, n_folds=5):
    """Return the mean cross-validated cost, used as the hyperopt loss.

    Args:
        params: Hyperparameter sample drawn by hyperopt for this trial.
        dataX: Training features, passed through to ``evaluate_model_cost``.
        dataY: Training labels, passed through to ``evaluate_model_cost``.
        n_folds: Requested number of folds.

    Returns:
        The mean of the per-fold costs returned by ``evaluate_model_cost``.
    """
    import numpy as np

    # NOTE(review): ``params`` and ``n_folds`` are accepted but never forwarded;
    # evaluate_model_cost() only receives the data. Presumably the model should
    # be built from ``params`` before it is evaluated -- TODO confirm.
    _, edp_cost_scores, _ = evaluate_model_cost(dataX=dataX, dataY=dataY)

    return np.array(edp_cost_scores).mean()


# Search space handed to hyperopt's fmin(). hp.choice picks one option from a
# range; hp.quniform samples uniformly and rounds to a multiple of its last
# argument; hp.uniform samples a continuous value.
param_space = {
    'max_depth': hp.choice('max_depth', range(5, 30, 1)),
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators': hp.choice('n_estimators', range(20, 205, 5)),
    'gamma': hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0.2, 0.8),
}

# Data and settings shared with the objective function f() below.
dataX = X_train
dataY = y_train
assert len(X_train) == len(y_train)
n_folds = 5

# Lowest cost seen so far and the number of evaluations done; both are updated
# by f(). Kept separate from ``best`` below, which holds the result of fmin().
best_cost = 0
i = 0


def f(params):
    """Objective passed to hyperopt: score ``params`` and track the best cost.

    Args:
        params: Hyperparameter sample drawn from ``param_space``.

    Returns:
        A dict with the cost under ``'loss'`` and ``STATUS_OK`` under
        ``'status'``, as consumed by ``fmin``.
    """
    # ``global`` must be declared inside the function; without it the
    # assignments below would create locals and the tracker would never update.
    global best_cost, i
    cost = return_model_scores(params, dataX, dataY, n_folds=n_folds)
    if i == 0 or cost < best_cost:
        best_cost = cost
        print('new best:', best_cost, params)
    i += 1

    return {'loss': cost, 'status': STATUS_OK}


trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=10, trials=trials)

# NOTE: for hp.choice parameters fmin() reports the index of the chosen option,
# not its value; hyperopt.space_eval(param_space, best) maps it back.
print('best:')
print(best)

其中evaluate_model_cost是您为在评估后获取成本值而构建的函数：

def evaluate_model_cost(dataX, dataY):
    """Cross-validate ``defined_model`` and collect one cost score per fold.

    The data is split with a shuffled 10-fold ``KFold`` (``random_state=1``).
    For every fold the model is fitted on the training rows, predictions are
    made on the test rows, and a cost is derived from the false positives,
    the false negatives and the savings of the first true positive found.

    The function relies on names that are not defined in this block and must
    exist at module level: ``defined_model``, ``features_to_train_on``,
    ``days_until_slag``, ``return_true_positive_savings``, ``costs_dict``,
    ``final_cost_custom_function`` and ``pd`` (pandas).

    Args:
        dataX: Feature table; indexed positionally via ``.iloc``.
        dataY: Labels aligned with ``dataX``; indexed via ``.iloc``.

    Returns:
        A tuple ``(defined_model, scores, histories)`` where ``scores`` and
        ``histories`` hold one entry per fold.
    """
    from tqdm import tqdm
    from sklearn.model_selection import KFold

    scores, histories = list(), list()
    # prepare cross validation
    kfold = KFold(10, shuffle=True, random_state=1)
    # enumerate splits
    for k, (train_ix, test_ix) in enumerate(tqdm(kfold.split(dataX))):
        print('kfold {}'.format(k))

        # select rows for train and test
        trainX, trainY = dataX.iloc[train_ix], dataY.iloc[train_ix]
        testX, testY = dataX.iloc[test_ix], dataY.iloc[test_ix]

        # fit model
        history = defined_model.fit(trainX, trainY,
                                    eval_metric='auc',
                                    eval_set=[(testX, testY)])

        # evaluate model (inside the loop so that every fold is scored)
        # NOTE(review): fit() uses all columns of trainX while predict() uses
        # only ``features_to_train_on`` -- confirm both see the same columns.
        y_preds = defined_model.predict(testX[features_to_train_on])

        y_true_values = testY
        true_predicted_tuples = pd.DataFrame({'y_true': y_true_values,
                                              'y_predicted': y_preds,
                                              'days_till_slag': days_until_slag})
        true_predicted_tuples = true_predicted_tuples.reset_index(drop=True)

        # Savings are credited for the first true positive only, so the scan
        # stops as soon as one is found.
        tp_savings = 0
        for index in true_predicted_tuples.index:
            row = true_predicted_tuples.iloc[index]
            # TRUE POSITIVE condition:
            if row['y_true'] == 1 and row['y_predicted'] == 1:
                tp_savings += return_true_positive_savings(row['days_till_slag'])
                break

        # false positives and false negatives number
        fp_number = len(true_predicted_tuples[(true_predicted_tuples.y_true == 0)
                                              & (true_predicted_tuples.y_predicted == 1)])
        fn_number = len(true_predicted_tuples[(true_predicted_tuples.y_true == 1)
                                              & (true_predicted_tuples.y_predicted == 0)])

        final_cost = (costs_dict['fp_cost'] * fp_number
                      + costs_dict['fn_cost'] * fn_number
                      - tp_savings)

        # NOTE(review): ``final_cost`` is computed but not used; the score is
        # taken from ``final_cost_custom_function``, a placeholder for your own
        # cost -- presumably it should be derived from ``final_cost``.
        score = final_cost_custom_function

        print('score en evaluate_model_with_slag_days', score)

        # append scores
        scores.append(score)
        histories.append(history)

    return defined_model, scores, histories

其中final_cost_custom_function是您要最小化的成本自定义函数。这可能有点复杂，如果您想要一个非常简单的贝叶斯优化示例，请从以下内容开始：

import pickle
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

def objective(x):
    """Toy objective for hyperopt: minimise ``x ** 2``.

    Returns a result dict with the mandatory ``'loss'`` and ``'status'`` keys
    plus a few optional extras to show what else can be stored per trial.
    """
    result = {'loss': x ** 2, 'status': STATUS_OK}
    # -- store other results like this
    result['eval_time'] = time.time()
    result['other_stuff'] = {'type': None, 'value': [0, 1, 2]}
    # -- attachments are handled differently
    result['attachments'] = {'time_module': pickle.dumps(time.time)}
    return result
# Run the same search twice with a different evaluation budget. Each run gets
# a fresh Trials object, and ``trials`` / ``best`` end up holding the last run.
for max_evals, label in ((100, 'with 100 trials: '),
                         (1000, 'with 1000 trials: ')):
    trials = Trials()
    best = fmin(objective,
                space=hp.uniform('x', -3, 3),
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)

    print(label, best)

其它你可能感兴趣的问题

上一篇合适的数据可视化？下一篇创建 CNN 模型图：不能使用 keras.utils.vis_utils 中的 plot_model

使用 进行网格搜索后，我的 xgboost 模型精度降低

使用进行网格搜索后，我的 xgboost 模型精度降低