Word Frequency Counting Across Multiple Sentences

Use collections.Counter to compute word frequencies across all sentences and, along the way, drop words that appear only once.

from collections import Counter

cc = Counter()  # global counter of word frequencies across all rows

def get_count(text):
    # Tokenize on spaces and add the tokens to the global counter.
    try:
        text_split = text.strip().split(' ')
        cc.update(text_split)
        return text_split
    except AttributeError:  # non-string values (e.g. NaN) pass through unchanged
        return text

def remove_one(text):
    # Keep only tokens that occur more than once in the whole corpus.
    try:
        return ' '.join(x for x in text.strip().split(' ') if cc[x] > 1)
    except AttributeError:
        return text

df['text_split'] = df['text'].apply(get_count)
df['text'] = df['text'].apply(remove_one)
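
A minimal usage sketch, assuming df is a pandas DataFrame with a text column (the toy data below is made up for illustration):

import pandas as pd

cc.clear()  # reset the global counter before processing a new corpus
df = pd.DataFrame({'text': ['the cat sat', 'the dog ran', 'a cat ran']})
df['text_split'] = df['text'].apply(get_count)  # first pass builds the global counts
df['text'] = df['text'].apply(remove_one)       # second pass drops singletons
print(df['text'].tolist())  # ['the cat', 'the ran', 'cat ran'] -- 'sat', 'dog', 'a' occur once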


Text Preprocessing

Filter punctuation and symbols, strip superscripts/diacritics, lowercase the text, set off non-English characters with spaces, collapse runs of three or more identical characters to one, and remove spaces inside a given list of words.

import regex as re  # the third-party 'regex' module supports \p{...} Unicode property classes
import unicodedata

# fword_list: a list of target words whose inner spaces should be removed;
# assumed to be defined elsewhere.
def process(text):
    try:
        # Replace punctuation, separators, symbols and digits with spaces.
        text = re.sub(r"\p{P}+|\p{Z}+|\p{S}+|\p{N}+", ' ', text)
        # Decompose accented/superscript characters, then strip the combining marks.
        text = unicodedata.normalize('NFKD', text)
        text = re.sub(r"\p{M}+", '', text)
        # Drop remaining punctuation, symbols, digits, surrogate/format/private-use chars.
        text = re.sub(r"\p{P}+|\p{S}+|\p{N}+|\p{Cs}+|\p{Cf}+|\p{Co}+", '', text)
        # Lowercase Latin letters.
        text = re.sub(r"([A-Za-z]+)", lambda m: m.group(1).lower(), text)
        # Set off non-ASCII characters with surrounding spaces.
        text = re.sub(r"([^\x00-\x7f])", lambda m: ' ' + m.group(1) + ' ', text)
        # Collapse runs of 3+ identical characters to a single character.
        text = re.sub(r"(\w)\1{2,}", lambda m: m.group(1), text)
        text = re.sub(r"\s+", ' ', text)
        # Remove spaces inside each target word, e.g. 'h e l l o' -> ' hello '.
        for fword in fword_list:
            f_re = ''
            for i, w in enumerate(fword):
                f_re += w + r"+\s*" if i < len(fword) - 1 else w + "+"
            text = re.sub(f_re, ' ' + fword + ' ', text)
        text = re.sub(r"\s+", ' ', text)
        return text
    except Exception:
        return text

df['text'] = df['text'].apply(process)
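
For a quick sense of the behavior (fword_list is a made-up example here; the original assumes it is defined elsewhere):

fword_list = ['hello']
print(process("Héllooo,   W O R L D!!  h e l l o  123"))
# -> ' hello w o r l d hello '  (accent stripped, 'ooo' collapsed to 'o',
#    punctuation and digits removed, the spaced-out 'hello' rejoined)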


LightGBM Cross Validation for Regression

Run a regression with LightGBM under cross validation, using RMSE as the loss.

import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified = False, debug= False):
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=17)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=17)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = train_df.columns.tolist()
    test_df = test_df[feats]
    #test_df = csr_matrix(test_df)
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y)):
        print('FOLD {}'.format(n_fold + 1))
        train_x, train_y = train_df.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], y.iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        lgb_params =  {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            #"n_estimators":10000,
            "learning_rate": 0.01,
            
            'num_leaves': 60,
            'subsample': 0.6143,
            'colsample_bytree': 0.6453,
            'min_split_gain': np.power(10, -2.5988),
            'reg_alpha': np.power(10, -2.2887),
            'reg_lambda': np.power(10, 1.7570),
            'min_child_weight': np.power(10, -0.1477),
            'max_depth': -1,
            #'zero_as_missing':True
        }
        '''
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
            'num_leaves': 16,
            'max_depth': -1,
            'min_child_samples': 1,
            'max_bin': 300,
            'subsample': 1.0,
            'subsample_freq': 1,
            'colsample_bytree': 0.5,
            'min_child_weight': 10,
            'reg_lambda': 0.1,
            'reg_alpha': 0.0,
            'scale_pos_weight': 1,
            'zero_as_missing': True,
            'num_threads': -1,
        }
        '''
        #train_x = csr_matrix(train_x)
        #valid_x = csr_matrix(valid_x)
        lgtrain = lgb.Dataset(train_x, train_y,
                        feature_name=feats,
                        categorical_feature = 'auto')
        lgvalid = lgb.Dataset(valid_x, valid_y,
                        feature_name=feats,
                        categorical_feature = 'auto')
        # Note: early_stopping_rounds / verbose_eval are lgb.train() arguments up to
        # LightGBM 3.x; in 4.x pass callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)] instead.
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=200,
            verbose_eval=100
        )
        
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, np.sqrt(metrics.mean_squared_error(valid_y, oof_preds[valid_idx]))))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    rmse = np.sqrt(metrics.mean_squared_error(y, oof_preds))
    print('Full RMSE score %.6f' % rmse)
    
    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')
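
A minimal invocation sketch on synthetic data, assuming train_df and test_df are plain feature DataFrames with identical columns and y is the target Series (all names and data here are illustrative):

rng = np.random.RandomState(17)
train_df = pd.DataFrame(rng.rand(500, 3), columns=['f0', 'f1', 'f2'])
y = pd.Series(2 * train_df['f0'] + 0.1 * rng.randn(500))  # noisy linear target
test_df = pd.DataFrame(rng.rand(100, 3), columns=['f0', 'f1', 'f2'])
feature_importance_df, test_preds = kfold_lightgbm(train_df, test_df, y, num_folds=5)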

LightGBM Cross Validation for Multiclass Classification

Run a multiclass classification with LightGBM under cross validation, using multi log loss.

import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold

SEED = 17  # random seed for the fold split; the original assumes SEED is defined elsewhere
def multi_log_loss(y_true, y_pred, num_classes):  # score function for CV
    eps = 1e-12
    y_true = y_true.astype('int')
    y_pred = y_pred.copy()  # avoid mutating the caller's array
    # Handle rows that are all zeros: fall back to a uniform distribution
    all_zeros = np.all(y_pred == 0, axis=1)
    y_pred[all_zeros] = 1.0 / num_classes
    # Clip away exact zeros, then normalise each row to sum to one
    y_pred += eps
    row_sums = np.sum(y_pred, axis=1)
    y_pred /= row_sums.reshape((-1, 1))
    # Average negative log-likelihood of the true class
    n_rows = y_true.size
    score_sum = 0.0
    for i in range(n_rows):
        score_sum -= np.log(y_pred[i, y_true[i]])
    return score_sum / n_rows
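
As a quick sanity check of the scorer (made-up values):

y_true = np.array([0, 1])
y_pred = np.array([[0.9, 0.1, 0.0],
                   [0.2, 0.7, 0.1]])
print(multi_log_loss(y_true, y_pred, num_classes=3))  # (-log 0.9 - log 0.7) / 2 ~= 0.2310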
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified = False, debug= False):
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=SEED)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=SEED)
    num_classes = 6  # number of target classes; adjust for the dataset
    # Create arrays and dataframes to store results
    oof_preds = np.zeros((train_df.shape[0],num_classes))
    sub_preds = np.zeros((test_df.shape[0],num_classes))
    feature_importance_df = pd.DataFrame()
    feats = train_df.columns.tolist()
    cat_feats = 'auto'
    #feats = select
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], y)):
        train_x, train_y = train_df[feats].iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], y.iloc[valid_idx]
        
        lgtrain = lgb.Dataset(train_x, train_y,
                        feature_name=feats,
                        categorical_feature = cat_feats)
        lgvalid = lgb.Dataset(valid_x, valid_y,
                        feature_name=feats,
                        categorical_feature = cat_feats)

        print('get lgb train valid dataset end')
        lgb_params =  {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class':num_classes,
            'metric': 'multi_logloss',
            #"n_estimators":10000,
            
            "learning_rate": 0.02,
            #"num_leaves": 200,
            #"feature_fraction": 0.50,
            #"bagging_fraction": 0.50,
            #'bagging_freq': 4,
            #"max_depth": -1,
            
            'num_leaves': 32,
            'max_depth': 8,
            'bagging_fraction': 0.7,
            'bagging_freq': 5,
            'feature_fraction': 0.7,
            
            "reg_alpha": 0.3,
            "reg_lambda": 0.1,
            'min_child_samples': 100,
            #'max_bin': 100,
            "min_split_gain":0.2,
            'nthread': 4,
            "min_child_weight":10,
        }
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train','valid'],
            early_stopping_rounds=100,
            verbose_eval=100
        )

        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d Multi Log Loss : %.6f' % (n_fold + 1, multi_log_loss(valid_y.values, oof_preds[valid_idx], num_classes)))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full Multi Log Loss %.6f' % multi_log_loss(y.values, oof_preds, num_classes))
    display_importances(feature_importance_df)
    return feature_importance_df,sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')
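
The multiclass version can be exercised the same way; here y holds integer labels in [0, num_classes) and stratified folds keep the class balance (synthetic, illustrative data):

rng = np.random.RandomState(SEED)
train_df = pd.DataFrame(rng.rand(600, 4), columns=['f0', 'f1', 'f2', 'f3'])
y = pd.Series(rng.randint(0, 6, size=600))  # six classes, matching num_classes above
test_df = pd.DataFrame(rng.rand(100, 4), columns=['f0', 'f1', 'f2', 'f3'])
feature_importance_df, test_preds = kfold_lightgbm(train_df, test_df, y, num_folds=5, stratified=True)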

one_hot_encoder and label_encoder

one_hot_encoder and label_encoder both transform categorical (object-typed) features.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def one_hot_encoder(df, nan_as_category=True):
    # One-hot encode every object-typed column; optionally add a dummy column for NaN.
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

def label_encoder(df):
    # Label-encode every object-typed column in place and return their names.
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype('str'))
    return df, categorical_columns
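
A quick usage sketch (the data is made up for illustration):

df = pd.DataFrame({'color': ['red', 'blue', None], 'size': [1, 2, 3]})
df_ohe, new_cols = one_hot_encoder(df.copy())
print(new_cols)  # ['color_blue', 'color_red', 'color_nan']
df_le, cat_cols = label_encoder(df.copy())
print(df_le['color'].tolist(), cat_cols)  # [2, 1, 0] ['color']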