Regression with LightGBM Cross Validation
Regression with LightGBM and cross validation, using RMSE as the evaluation metric.
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # needed for pd.DataFrame / pd.concat below
import random
import os
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import gc


def kfold_lightgbm(train_df, test_df, y, num_folds, stratified=False, debug=False):
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=17)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=17)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])   # out-of-fold predictions on the train set
    sub_preds = np.zeros(test_df.shape[0])    # fold-averaged predictions on the test set
    feature_importance_df = pd.DataFrame()
    feats = train_df.columns.tolist()
    test_df = test_df[feats]
    # test_df = csr_matrix(test_df)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y)):
        print('FOLD {}'.format(n_fold))
        train_x, train_y = train_df.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], y.iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        lgb_params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            # 'n_estimators': 10000,
            'learning_rate': 0.01,
            'num_leaves': 60,
            'subsample': 0.6143,
            'colsample_bytree': 0.6453,
            'min_split_gain': np.power(10, -2.5988),
            'reg_alpha': np.power(10, -2.2887),
            'reg_lambda': np.power(10, 1.7570),
            'min_child_weight': np.power(10, -0.1477),
            'max_depth': -1,
            # 'zero_as_missing': True
        }
        '''
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 16,
            'max_depth': -1,
            'min_child_samples': 1,
            'max_bin': 300,
            'subsample': 1.0,
            'subsample_freq': 1,
            'colsample_bytree': 0.5,
            'min_child_weight': 10,
            'reg_lambda': 0.1,
            'reg_alpha': 0.0,
            'scale_pos_weight': 1,
            'zero_as_missing': True,
            'num_threads': -1,
        }
        '''
        # train_x = csr_matrix(train_x)
        # valid_x = csr_matrix(valid_x)
        lgtrain = lgb.Dataset(train_x, train_y,
                              feature_name=feats,
                              categorical_feature='auto')
        lgvalid = lgb.Dataset(valid_x, valid_y,
                              feature_name=feats,
                              categorical_feature='auto')

        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=200,
            verbose_eval=100
        )
        # clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
        #         eval_metric='auc', verbose=100, early_stopping_rounds=200)

        # oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits

        # Collect per-fold feature importances for the summary plot
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        # print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        print('Fold %2d RMSE : %.6f' % (n_fold, np.sqrt(metrics.mean_squared_error(valid_y, oof_preds[valid_idx]))))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    rmse = np.sqrt(metrics.mean_squared_error(y, oof_preds))
    # print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))
    print('Full RMSE score %.6f' % rmse)
    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds


# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')
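The function expects the train and test features as pandas DataFrames and the target as a pandas Series. Below is a minimal usage sketch on synthetic data; the dataset, feature names, and fold count are made up for illustration and are not part of the original code.

# Usage sketch with synthetic data (illustrative only).
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

X, target = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=17)
feature_names = ['f{}'.format(i) for i in range(X.shape[1])]

train_df = pd.DataFrame(X[:800], columns=feature_names)   # hypothetical train split
test_df = pd.DataFrame(X[800:], columns=feature_names)    # hypothetical test split
y = pd.Series(target[:800])

# 5-fold CV: returns per-fold feature importances and fold-averaged test predictions
feature_importance_df, test_preds = kfold_lightgbm(train_df, test_df, y, num_folds=5)

Note that the listing above assumes a LightGBM 3.x release: in LightGBM 4.x the early_stopping_rounds and verbose_eval arguments were removed from lgb.train in favor of the lgb.early_stopping and lgb.log_evaluation callbacks.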