
Code for Double Cross Validation (Nested Cross Validation)

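In double (nested) cross validation, an inner loop tunes hyperparameters while an outer loop scores the model on folds the tuning never touched, so the final estimate is not biased by the hyperparameter search. Before the NGBoost and LightGBM versions below, here is a minimal sketch of that pattern using scikit-learn's GridSearchCV and cross_val_score (the SVR model, parameter grid, and diabetes dataset are illustrative placeholders, not from the original code):

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.svm import SVR

X, y = load_diabetes(return_X_y=True)

# Inner loop: hyperparameter search
inner_cv = KFold(n_splits=5, shuffle=True, random_state=0)
clf = GridSearchCV(SVR(), param_grid={'C': [0.1, 1, 10]}, cv=inner_cv)

# Outer loop: generalization estimate on folds the search never saw
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(clf, X, y, cv=outer_cv, scoring='neg_mean_absolute_error')
print('nested CV MAE:', -np.mean(scores))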

NGBoost

# Imports (assumed; these are the libraries the snippet uses)
import numpy as np
import ngboost
from ngboost.distns import Normal
from ngboost.scores import LogScore
import shap
import sklearn.metrics
import sklearn.model_selection
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

# Weak learner (base learner)
dtr_friedman_3 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3)

# Fixed NGBoost parameters
FIXED_PARAMS = {"Base": dtr_friedman_3,
                "natural_gradient": True,
                "random_state": 0,
                "Dist": Normal, 
                "verbose": False,
                "Score": LogScore,
               }

# Whether to tune n_estimators via early_stopping_rounds in the inner loop
adjust = True
# Whether to show SHAP plots
shap_show = True

# Outer loop
cv_outer = sklearn.model_selection.LeaveOneOut()
# Inner loop
# When tuning with early_stopping_rounds, stopping is decided in the inner loop
cv_inner = sklearn.model_selection.LeaveOneOut()

# Lists to store results
outer_scores = list()
outer_trues = list()
outer_preds = list()
outer_stds = list()
models = list()
for train_ix, valid_ix in tqdm(cv_outer.split(Xt_outer, yt_outer)):
    # Training and validation data
    Xt, Xv = Xt_outer.iloc[train_ix, :], Xt_outer.iloc[valid_ix, :]
    yt, yv = yt_outer.iloc[train_ix], yt_outer.iloc[valid_ix]

    # Alternative: run GridSearchCV in the inner loop (kept commented out)
    # # inner loop
    # param_grid = {
    #     "n_estimators": [100, 500],
    #     #"col_sample": [0.01, 0.1, 1.0],
    #     # "minibatch_frac": [0.1, 1.0],
    #     "learning_rate": [0.01, 0.05, 0.1],
    #     'tol': [1e-4, 0.01]
    # }
    # ngb = ngboost.NGBRegressor(**FIXED_PARAMS)
    # clf = sklearn.model_selection.GridSearchCV(ngb, param_grid=param_grid, cv=cv_inner, scoring='neg_mean_absolute_error')
    # clf.fit(Xt, yt)
    # best_param = clf.best_params_
    # best_param.update(FIXED_PARAMS)
    # ngb = ngboost.NGBRegressor(**best_param)

    # Apply early_stopping_rounds in the inner loop to set a reasonable n_estimators
    if adjust:
        itrs = []
        for train_ix2, valid_ix2 in (cv_inner.split(Xt, yt)):
            Xt_inner, Xv_inner = Xt.iloc[train_ix2, :], Xt.iloc[valid_ix2, :]
            yt_inner, yv_inner = yt.iloc[train_ix2], yt.iloc[valid_ix2]
            # Use the same fixed params as the final model so the tuned
            # n_estimators transfers (FIXED_PARAMS already sets verbose=False)
            ngb = ngboost.NGBRegressor(**FIXED_PARAMS, n_estimators=500)
            ngb.fit(Xt_inner, yt_inner, X_val=Xv_inner, Y_val=yv_inner, early_stopping_rounds=50)
            itrs.append(ngb.best_val_loss_itr)
        print('n_estimators', int(np.mean(itrs)))
        ngb = ngboost.NGBRegressor(**FIXED_PARAMS, n_estimators=int(np.mean(itrs)))
    # No early stopping, so the inner loop is not used
    else:
        ngb = ngboost.NGBRegressor(**FIXED_PARAMS, n_estimators=250)

    ngb.fit(Xt, yt)
    # Get the predicted mean and spread from the predictive distribution
    dist = ngb.pred_dist(Xv)
    yhat = dist.params['loc']
    ystd = dist.params['scale']
    # score
    mae = sklearn.metrics.mean_absolute_error(yv, yhat)
    # Store the results (LeaveOneOut: index [0] picks the single held-out sample)
    outer_scores.append(mae)
    outer_preds.append(yhat[0])
    outer_stds.append(ystd[0])
    outer_trues.append(yv.to_numpy()[0])
    models.append(ngb)

    # Show SHAP for each outer-loop validation sample
    if shap_show:
        X_v_shap = Xv.reset_index(drop=True).copy()
        explainer = shap.TreeExplainer(model=ngb, model_output=0)
        shap_values = explainer.shap_values(X=X_v_shap)
        shap.decision_plot(explainer.expected_value
                           , shap_values, X_v_shap
                           , ignore_warnings=True, highlight=[0])
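With LeaveOneOut in the outer loop, each fold contributes one held-out sample, so the stored lists can be aggregated afterwards. A minimal sketch, continuing from the loop and imports above (the 1.96-sigma interval check is an illustrative use of the predicted Normal scale, not from the original):

# Aggregate the leave-one-out results
trues = np.array(outer_trues)
preds = np.array(outer_preds)
stds = np.array(outer_stds)

# Overall MAE over all held-out samples
print('LOO MAE:', sklearn.metrics.mean_absolute_error(trues, preds))

# Empirical coverage of the 95% interval implied by Normal(loc, scale)
within = np.abs(trues - preds) <= 1.96 * stds
print('95% interval coverage:', within.mean())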

LightGBM

# Imports (assumed; these are the libraries the snippet uses)
import lightgbm
import numpy as np
import optuna.integration.lightgbm as opt_lgb
import shap
import sklearn.model_selection
from tqdm import tqdm

# Base LightGBM parameters, defined outside the if-block so that `params`
# exists even when the hyperparameter search below is skipped
params = {'task': 'train',
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'binary_logloss',
          'verbose': -1,
          'random_state': 0,  # random seed
         }

# Hyperparameter search for LightGBM (using Optuna's LightGBMTunerCV)
search_params = False
if search_params:

    lgb_train = opt_lgb.Dataset(X_train, y_train, weight=sample_ws)  # set sample weights
    skf = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Set up the LightGBM tuner
    tuner_cv = opt_lgb.LightGBMTunerCV(params, lgb_train
                                       , num_boost_round=1000
                                       , folds=skf
                                       , return_cvbooster=True
                                       , optuna_seed=0
                                       , callbacks=[opt_lgb.early_stopping(stopping_rounds=50, verbose=True)])

    # Search for the best parameters
    tuner_cv.run()
    # Print the best score and parameters
    print(f'Best score: {tuner_cv.best_score}')
    print('Best params:')
    print(tuner_cv.best_params)

    # Adopt the tuned parameters (tuner_cv only exists when the search ran)
    params = tuner_cv.best_params

# Nested cross validation to find a reasonable `num_boost_round`
# Whether to tune via early_stopping_rounds
adjust = False
# Whether to compute SHAP values
shap_calc = True

# Outer loop
cv_outer = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# Inner loop
# To pick `num_boost_round`, apply early_stopping_rounds in the inner loop
cv_inner = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Lists to store results
outer_trues = []
outer_preds = []
models = []
Xt_outer = X_train.copy()
yt_outer = y_train.copy()
shap_exp = []
shap_val0 = []
shap_val1 = []
valids = []

# Outer Loop
for cv_i, (train_ix, valid_ix) in tqdm(enumerate(cv_outer.split(Xt_outer, yt_outer))):
    # Training and validation data
    Xt, Xv = Xt_outer.iloc[train_ix, :], Xt_outer.iloc[valid_ix, :]
    yt, yv = yt_outer.iloc[train_ix], yt_outer.iloc[valid_ix]
    sample_ws_t = sample_ws[train_ix]
    sample_ws_v = sample_ws[valid_ix]
    if adjust:
        itrs = []
        # Inner Loop
        for train_ix2, valid_ix2 in (cv_inner.split(Xt, yt)):
            Xt_inner, Xv_inner = Xt.iloc[train_ix2, :], Xt.iloc[valid_ix2, :]
            yt_inner, yv_inner = yt.iloc[train_ix2], yt.iloc[valid_ix2]
            sample_ws_t_t = sample_ws_t[train_ix2]
            lgb_train = lightgbm.Dataset(Xt_inner, yt_inner, weight=sample_ws_t_t)
            # Validation Dataset should reference the training set for consistent binning
            lgb_valid = lightgbm.Dataset(Xv_inner, yv_inner, reference=lgb_train)
            # Train with early stopping, capped at 1000 boosting rounds
            lgb = lightgbm.train(params, lgb_train, valid_sets=[lgb_valid], num_boost_round=1000
                                 , callbacks=[lightgbm.early_stopping(stopping_rounds=100, verbose=True)  # early-stopping callback
                                              , lightgbm.log_evaluation(0)]  # console-logging callback
                                )
            itrs.append(lgb.best_iteration)
        # Use the inner-CV average of the best iterations as `num_boost_round`
        print('n_estimators', int(np.mean(itrs)))
        lgb_train = lightgbm.Dataset(Xt, yt, weight=sample_ws_t)
        lgb = lightgbm.train(params, lgb_train, num_boost_round=int(np.mean(itrs)))
    # No early stopping, so the inner loop is not used
    else:
        # `num_boost_round` is fixed without the inner loop
        lgb_train = lightgbm.Dataset(Xt, yt, weight=sample_ws_t)
        lgb = lightgbm.train(params, lgb_train, num_boost_round=365)

    # Get predictions
    yhat = lgb.predict(Xv)
    # Store the results
    outer_preds.append(yhat)  # predictions
    outer_trues.append(yv.to_numpy())  # true values
    models.append(lgb)  # model
    valids.append(Xv)  # validation data

    # SHAP calculation
    if shap_calc:
        print('shap value calculation, outer cv {}'.format(cv_i))
        X_valid_shap = Xv.reset_index(drop=True)
        explainer = shap.TreeExplainer(model=lgb)
        shap_values = explainer.shap_values(X=X_valid_shap)
        shap_exp.append(explainer)
        # For a binary LightGBM model, older shap versions return a list with one array per class
        shap_val0.append(shap_values[0])
        shap_val1.append(shap_values[1])
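The out-of-fold predictions from the outer loop can then be scored as a whole, and the per-fold SHAP values pooled into a single summary plot. A minimal sketch, continuing from the loop above (roc_auc_score and log_loss are illustrative metric choices for the binary objective; pandas is assumed):

import pandas as pd
import sklearn.metrics

# Concatenate out-of-fold predictions and true labels across the outer folds
all_trues = np.concatenate(outer_trues)
all_preds = np.concatenate(outer_preds)
print('OOF ROC-AUC :', sklearn.metrics.roc_auc_score(all_trues, all_preds))
print('OOF log loss:', sklearn.metrics.log_loss(all_trues, all_preds))

# Pool the positive-class SHAP values across folds for a global summary plot
# (rows of the stacked SHAP matrix align with the concatenated validation folds)
shap.summary_plot(np.vstack(shap_val1),
                  pd.concat(valids).reset_index(drop=True))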