# Multi-class LightGBM cross-validation.
# Uses LightGBM with cross-validation for multi-class classification, scored by multi log loss.
import gc
import os
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
def multi_log_loss(y_true, y_pred, num_classes):  # score function for CV
    """Compute the mean multi-class logarithmic loss.

    Parameters
    ----------
    y_true : 1-D array-like of int
        True class indices in [0, num_classes).
    y_pred : 2-D array-like, shape (n_rows, num_classes)
        Predicted class probabilities (rows need not sum to 1;
        they are renormalized here).
    num_classes : int
        Number of classes; used to assign uniform probability to
        all-zero rows.

    Returns
    -------
    float
        Mean negative log-probability of the true class.
    """
    eps = 1e-12
    y_true = np.asarray(y_true).astype('int')
    # Work on a copy so the caller's array is not mutated
    # (the original version modified y_pred in place).
    y_pred = np.array(y_pred, dtype=float)
    # Handle all-zero rows BEFORE adding eps — the original added eps
    # first, so the all-zero branch could never trigger.
    all_zeros = np.all(y_pred == 0, axis=1)
    y_pred[all_zeros] = 1.0 / num_classes
    # eps keeps log() finite for zero probabilities.
    y_pred += eps
    # Normalise each row so probabilities sum to one.
    y_pred /= np.sum(y_pred, axis=1).reshape((-1, 1))
    # Vectorized: pick the predicted probability of the true class per row.
    n_rows = y_true.size
    picked = y_pred[np.arange(n_rows), y_true]
    return float(-np.mean(np.log(picked)))
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified=False, debug=False, num_classes=6):
    """Train LightGBM multi-class models with K-fold CV.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training features; every column is used as a feature.
    test_df : pd.DataFrame
        Test features; must contain the training columns.
    y : pd.Series
        Integer class labels aligned with ``train_df``.
    num_folds : int
        Number of CV folds.
    stratified : bool
        Use StratifiedKFold instead of plain KFold.
    debug : bool
        Unused; kept for interface compatibility.
    num_classes : int
        Number of target classes (default 6, matching the original
        hard-coded value).

    Returns
    -------
    (pd.DataFrame, np.ndarray)
        Per-fold feature importances and fold-averaged class-probability
        predictions for ``test_df``.

    Notes
    -----
    Relies on a module-level ``SEED`` constant for reproducible splits.
    """
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=SEED)
    # Out-of-fold and test predictions, one probability column per class.
    oof_preds = np.zeros((train_df.shape[0], num_classes))
    sub_preds = np.zeros((test_df.shape[0], num_classes))
    feature_importance_df = pd.DataFrame()
    feats = train_df.columns.tolist()
    cat_feats = 'auto'  # let LightGBM auto-detect categorical columns
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], y)):
        train_x, train_y = train_df[feats].iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], y.iloc[valid_idx]
        lgtrain = lgb.Dataset(train_x, train_y,
                              feature_name=feats,
                              categorical_feature=cat_feats)
        lgvalid = lgb.Dataset(valid_x, valid_y,
                              feature_name=feats,
                              categorical_feature=cat_feats)
        print('get lgb train valid dataset end')
        lgb_params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class': num_classes,
            'metric': 'multi_logloss',
            'learning_rate': 0.02,
            'num_leaves': 32,
            'max_depth': 8,
            'bagging_fraction': 0.7,
            'bagging_freq': 5,
            'feature_fraction': 0.7,
            'reg_alpha': 0.3,
            'reg_lambda': 0.1,
            'min_child_samples': 100,
            'min_split_gain': 0.2,
            'nthread': 4,
            'min_child_weight': 10,
        }
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=100,
            verbose_eval=100
        )
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        # BUG FIX: predict on the same feature subset used for training
        # (test_df may carry extra columns, e.g. an id column); the
        # original passed the full test_df here.
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d Multi Log Loss : %.6f' % (n_fold + 1, multi_log_loss(valid_y.values, oof_preds[valid_idx], num_classes)))
        # Free per-fold objects before the next iteration to limit peak memory.
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full Multi Log Loss %.6f' % multi_log_loss(y.values, oof_preds, num_classes))
    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with highest mean importance across folds.

    Saves the bar plot to 'lgb_importances.png' in the working directory.
    """
    # Rank features by their mean importance over all folds.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index
    # Keep every per-fold row for the top features so the bar plot
    # shows fold-to-fold variation.
    subset = feature_importance_df_.loc[feature_importance_df_.feature.isin(top_features)]
    subset = subset.sort_values(by="importance", ascending=False)
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=subset)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')