# Installing the libraries with the specified version.
# !pip install pandas==1.5.3 numpy==1.25.2 matplotlib==3.7.1 seaborn==0.13.1 scikit-learn==1.2.2 imbalanced-learn==0.10.1 xgboost==2.0.3 threadpoolctl==3.3.0 -q --user

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To tune model, get different metric scores, and split data
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
)
from sklearn import metrics

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To impute missing values
from sklearn.impute import SimpleImputer

# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To do hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To suppress scientific notations
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To suppress warnings
import warnings

warnings.filterwarnings("ignore")

# uncomment and run the following lines for Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# loading data
train_data = pd.read_csv('/content/drive/MyDrive/content/Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/content/Test.csv')

# data head
train_data.head()

# data shape
train_data.shape

(20000, 41)

# data details
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      19982 non-null  float64
 1   V2      19982 non-null  float64
 2   V3      20000 non-null  float64
 3   V4      20000 non-null  float64
 4   V5      20000 non-null  float64
 5   V6      20000 non-null  float64
 6   V7      20000 non-null  float64
 7   V8      20000 non-null  float64
 8   V9      20000 non-null  float64
 9   V10     20000 non-null  float64
 10  V11     20000 non-null  float64
 11  V12     20000 non-null  float64
 12  V13     20000 non-null  float64
 13  V14     20000 non-null  float64
 14  V15     20000 non-null  float64
 15  V16     20000 non-null  float64
 16  V17     20000 non-null  float64
 17  V18     20000 non-null  float64
 18  V19     20000 non-null  float64
 19  V20     20000 non-null  float64
 20  V21     20000 non-null  float64
 21  V22     20000 non-null  float64
 22  V23     20000 non-null  float64
 23  V24     20000 non-null  float64
 24  V25     20000 non-null  float64
 25  V26     20000 non-null  float64
 26  V27     20000 non-null  float64
 27  V28     20000 non-null  float64
 28  V29     20000 non-null  float64
 29  V30     20000 non-null  float64
 30  V31     20000 non-null  float64
 31  V32     20000 non-null  float64
 32  V33     20000 non-null  float64
 33  V34     20000 non-null  float64
 34  V35     20000 non-null  float64
 35  V36     20000 non-null  float64
 36  V37     20000 non-null  float64
 37  V38     20000 non-null  float64
 38  V39     20000 non-null  float64
 39  V40     20000 non-null  float64
 40  Target  20000 non-null  int64  
dtypes: float64(40), int64(1)
memory usage: 6.3 MB

# find null values
train_data.isnull().sum()

# find duplicate values
train_data.duplicated().sum()

0

# dataset details
train_data.describe().T

# data head
test_data.head()

# data shape
test_data.shape

(5000, 41)

# data details
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 41 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      4995 non-null   float64
 1   V2      4994 non-null   float64
 2   V3      5000 non-null   float64
 3   V4      5000 non-null   float64
 4   V5      5000 non-null   float64
 5   V6      5000 non-null   float64
 6   V7      5000 non-null   float64
 7   V8      5000 non-null   float64
 8   V9      5000 non-null   float64
 9   V10     5000 non-null   float64
 10  V11     5000 non-null   float64
 11  V12     5000 non-null   float64
 12  V13     5000 non-null   float64
 13  V14     5000 non-null   float64
 14  V15     5000 non-null   float64
 15  V16     5000 non-null   float64
 16  V17     5000 non-null   float64
 17  V18     5000 non-null   float64
 18  V19     5000 non-null   float64
 19  V20     5000 non-null   float64
 20  V21     5000 non-null   float64
 21  V22     5000 non-null   float64
 22  V23     5000 non-null   float64
 23  V24     5000 non-null   float64
 24  V25     5000 non-null   float64
 25  V26     5000 non-null   float64
 26  V27     5000 non-null   float64
 27  V28     5000 non-null   float64
 28  V29     5000 non-null   float64
 29  V30     5000 non-null   float64
 30  V31     5000 non-null   float64
 31  V32     5000 non-null   float64
 32  V33     5000 non-null   float64
 33  V34     5000 non-null   float64
 34  V35     5000 non-null   float64
 35  V36     5000 non-null   float64
 36  V37     5000 non-null   float64
 37  V38     5000 non-null   float64
 38  V39     5000 non-null   float64
 39  V40     5000 non-null   float64
 40  Target  5000 non-null   int64  
dtypes: float64(40), int64(1)
memory usage: 1.6 MB

# find null values
test_data.isnull().sum()

# find duplicate values
test_data.duplicated().sum()

0

# dataset details
test_data.describe().T

# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to the show density curve (default False)
    bins: number of bins for histogram (default None)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid= 2
        sharex=True,  # x-axis will be shared among all subplots
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )  # creating the 2 subplots
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )  # boxplot will be created and a star will indicate the mean value of the column
    sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
    ) if bins else sns.histplot(
        data=data, x=feature, kde=kde, ax=ax_hist2
    )  # For histogram
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # Add mean to the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # Add median to the histogram

for feature in train_data.columns:
    histogram_boxplot(train_data, feature, figsize=(12, 7), kde=False, bins=None) ## Please change the dataframe name as you define while reading the data

train_data['Target'].value_counts(True)

test_data['Target'].value_counts(True)

# Dividing train data into X and y
X = train_data.drop(["Target"], axis=1)
y = train_data["Target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
# Check the number of rows and columns in the X_train data
print(X_train.shape, X_val.shape)

(16000, 40) (4000, 40)

X_test = test_data.drop(["Target"], axis=1)
y_test = test_data["Target"]

X_test.shape

(5000, 40)

# creating an instace of the imputer to be used
imputer = SimpleImputer(strategy="median")

# Fit on the training data and transform it
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

# Transform the validation data based on the fit from the training data
X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

# Transform the test data based on the fit from the training data
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

print("------- Training -------")
print(X_train.isna().sum())
print("------- Validation -------")
print(X_val.isna().sum())
print("------- Testing -------")
print(X_test.isna().sum())

------- Training -------
V1     0
V2     0
V3     0
V4     0
V5     0
V6     0
V7     0
V8     0
V9     0
V10    0
V11    0
V12    0
V13    0
V14    0
V15    0
V16    0
V17    0
V18    0
V19    0
V20    0
V21    0
V22    0
V23    0
V24    0
V25    0
V26    0
V27    0
V28    0
V29    0
V30    0
V31    0
V32    0
V33    0
V34    0
V35    0
V36    0
V37    0
V38    0
V39    0
V40    0
dtype: int64
------- Validation -------
V1     0
V2     0
V3     0
V4     0
V5     0
V6     0
V7     0
V8     0
V9     0
V10    0
V11    0
V12    0
V13    0
V14    0
V15    0
V16    0
V17    0
V18    0
V19    0
V20    0
V21    0
V22    0
V23    0
V24    0
V25    0
V26    0
V27    0
V28    0
V29    0
V30    0
V31    0
V32    0
V33    0
V34    0
V35    0
V36    0
V37    0
V38    0
V39    0
V40    0
dtype: int64
------- Testing -------
V1     0
V2     0
V3     0
V4     0
V5     0
V6     0
V7     0
V8     0
V9     0
V10    0
V11    0
V12    0
V13    0
V14    0
V15    0
V16    0
V17    0
V18    0
V19    0
V20    0
V21    0
V22    0
V23    0
V24    0
V25    0
V26    0
V27    0
V28    0
V29    0
V30    0
V31    0
V32    0
V33    0
V34    0
V35    0
V36    0
V37    0
V38    0
V39    0
V40    0
dtype: int64

# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    predictors: independent variables
    target: dependent variable
    """

    # predicting using the independent variables
    pred = model.predict(predictors)

    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "Accuracy": acc,
            "Recall": recall,
            "Precision": precision,
            "F1": f1

        },
        index=[0],
    )

    return df_perf

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)

models = []  # Empty list to store all the models

# Appending models into the list
models.append(("DecisionTree", DecisionTreeClassifier(random_state=1)))
models.append(("RandomForest", RandomForestClassifier(random_state=1)))
models.append(("AdaBoost", AdaBoostClassifier(random_state=1)))
models.append(("GradientBoosting", GradientBoostingClassifier(random_state=1)))
models.append(("BaggingClassifier", BaggingClassifier(random_state=1)))
models.append(("LogisticRegression", LogisticRegression(random_state=1, max_iter=10000)))

results_original = []  # Empty list to store all model's CV scores
names_original = []  # Empty list to store name of the models


# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation performance on training dataset:" "\n")

for name, model in models:
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scorer, cv=kfold
    )
    results_original.append(cv_result)
    names_original.append(name)
    print(f"{name} - Validation Performance = {cv_result.mean()}")

print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_val, model.predict(X_val))
    print(f"{name} - Recall Score = {scores}")

Cross-Validation performance on training dataset:

DecisionTree - Validation Performance = 0.7196280073636767
RandomForest - Validation Performance = 0.7195899193804354
AdaBoost - Validation Performance = 0.5382784231574939
GradientBoosting - Validation Performance = 0.7173363803719928
BaggingClassifier - Validation Performance = 0.7083222243382213
LogisticRegression - Validation Performance = 0.48988129245223133

Validation Performance:

DecisionTree - Recall Score = 0.7387387387387387
RandomForest - Recall Score = 0.7432432432432432
AdaBoost - Recall Score = 0.5630630630630631
GradientBoosting - Recall Score = 0.7432432432432432
BaggingClassifier - Recall Score = 0.7207207207207207
LogisticRegression - Recall Score = 0.49099099099099097

# Plotting boxplots for CV scores of all models defined above
fig = plt.figure(figsize=(10, 7))

fig.suptitle("Algorithm Comparison of models on Original data")
ax = fig.add_subplot(111)

plt.boxplot(results_original)
ax.set_xticklabels(names_original)

plt.show()

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Synthetic Minority Over Sampling Technique
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

print("After OverSampling, counts of label '1': {}".format(sum(y_train_over == 1)))
print("After OverSampling, counts of label '0': {} \n".format(sum(y_train_over == 0)))

print("After OverSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After OverSampling, the shape of train_y: {} \n".format(y_train_over.shape))

Before OverSampling, counts of label '1': 888
Before OverSampling, counts of label '0': 15112 

After OverSampling, counts of label '1': 15112
After OverSampling, counts of label '0': 15112 

After OverSampling, the shape of train_X: (30224, 40)
After OverSampling, the shape of train_y: (30224,)

models = []
models.append(("DecisionTree", DecisionTreeClassifier(random_state=1)))
models.append(("RandomForest", RandomForestClassifier(random_state=1)))
models.append(("AdaBoost", AdaBoostClassifier(random_state=1)))
models.append(("GradientBoosting", GradientBoostingClassifier(random_state=1)))
models.append(("BaggingClassifier", BaggingClassifier(random_state=1)))
models.append(("LogisticRegression", LogisticRegression(random_state=1, max_iter=10000)))  # Increased max_iter for convergence

# To store cross-validation results
results_oversampled = []
# To store model names
names_oversampled= []

# Cross-validation across all models for Oversampled Data
print("\nCross-Validation on Oversampled Data:\n")

# StratifiedKFold setup
kfold_oversampled = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validation across all models
for name, model in models:
    cv_result_oversampled = cross_val_score(model, X_train_over, y_train_over, scoring=scorer, cv=kfold_oversampled)
    results_oversampled.append(cv_result_oversampled)
    names_oversampled.append(name)
    print(f"{name} - Validation Performance = {cv_result_oversampled.mean()}")

print("\nValidation Performance on Oversampled Data:\n")

# Fit models on the oversampled training set and evaluate on the original validation set
for name, model in models:
    model.fit(X_train_over, y_train_over)  # Use the oversampled training data
    scores_oversampled = recall_score(y_val, model.predict(X_val))  # Evaluate against the validation set
    print(f"{name} - Recall Score = {scores_oversampled}")

Cross-Validation on Oversampled Data:

DecisionTree - Validation Performance = 0.9732668119313808
RandomForest - Validation Performance = 0.9856406421275405
AdaBoost - Validation Performance = 0.8827421272560054
GradientBoosting - Validation Performance = 0.9239674518302545
BaggingClassifier - Validation Performance = 0.9781630048735123
LogisticRegression - Validation Performance = 0.8812865538044636

Validation Performance on Oversampled Data:

DecisionTree - Recall Score = 0.8198198198198198
RandomForest - Recall Score = 0.8558558558558559
AdaBoost - Recall Score = 0.8603603603603603
GradientBoosting - Recall Score = 0.8783783783783784
BaggingClassifier - Recall Score = 0.8423423423423423
LogisticRegression - Recall Score = 0.8513513513513513

# Plotting boxplots for CV scores of all models evaluated on oversampled data
fig = plt.figure(figsize=(10, 7))

fig.suptitle("Algorithm Comparison of Models Trained on Oversampled Data")
ax = fig.add_subplot(111)

plt.boxplot(results_oversampled)
ax.set_xticklabels(names_oversampled)

plt.show()

print("Before Under Sampling, count of label '1': {}".format(sum(y_train== 1)))
print("Before Under Sampling, count of label '0': {} \n".format(sum(y_train == 0)))

# Random undersampler for under sampling the data
rus = RandomUnderSampler(random_state=1, sampling_strategy=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)

print("After Under Sampling, count of label '1': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, count of label '0': {} \n".format(sum(y_train_un == 0)))

print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))

Before Under Sampling, count of label '1': 888
Before Under Sampling, count of label '0': 15112 

After Under Sampling, count of label '1': 888
After Under Sampling, count of label '0': 888 

After Under Sampling, the shape of train_X: (1776, 40)
After Under Sampling, the shape of train_y: (1776,)

models_undersampled = []
models_undersampled.append(("DecisionTree", DecisionTreeClassifier(random_state=1)))
models_undersampled.append(("RandomForest", RandomForestClassifier(random_state=1)))
models_undersampled.append(("AdaBoost", AdaBoostClassifier(random_state=1)))
models_undersampled.append(("GradientBoosting", GradientBoostingClassifier(random_state=1)))
models_undersampled.append(("BaggingClassifier", BaggingClassifier(random_state=1)))
models_undersampled.append(("LogisticRegression", LogisticRegression(random_state=1, max_iter=10000)))  # Increased max_iter for convergence

# To store cross-validation results for undersampled data
results_undersampled = []
# To store model names for undersampled data
names_undersampled = []

# Cross-validation across all models for Undersampled Data
print("\nCross-Validation on Undersampled Data:\n")

# StratifiedKFold setup for undersampled data
kfold_undersampled = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Cross-validation across all models
for name, model in models_undersampled:
    cv_result_undersampled = cross_val_score(model, X_train_un, y_train_un, scoring=scorer, cv=kfold_undersampled)
    results_undersampled.append(cv_result_undersampled)
    names_undersampled.append(name)
    print(f"{name} - Validation Performance = {cv_result_undersampled.mean()}")

print("\nValidation Performance on Undersampled Data:\n")

# Fit models on the undersampled training set and evaluate on the original validation set
for name, model in models_undersampled:
    model.fit(X_train_un, y_train_un)  # Use the undersampled training data
    scores_undersampled = recall_score(y_val, model.predict(X_val))  # Evaluate against the validation set
    print(f"{name} - Recall Score = {scores_undersampled}")

Cross-Validation on Undersampled Data:

DecisionTree - Validation Performance = 0.8468355233923697
RandomForest - Validation Performance = 0.8975052370976957
AdaBoost - Validation Performance = 0.8569859709261728
GradientBoosting - Validation Performance = 0.8907446200723672
BaggingClassifier - Validation Performance = 0.8704627689963816
LogisticRegression - Validation Performance = 0.8513235574176348

Validation Performance on Undersampled Data:

DecisionTree - Recall Score = 0.8468468468468469
RandomForest - Recall Score = 0.8783783783783784
AdaBoost - Recall Score = 0.8558558558558559
GradientBoosting - Recall Score = 0.8873873873873874
BaggingClassifier - Recall Score = 0.8918918918918919
LogisticRegression - Recall Score = 0.8648648648648649

# Plotting boxplots for CV scores of all models evaluated on  undersampled data
fig = plt.figure(figsize=(10, 7))

fig.suptitle("Algorithm Comparison of Models Trained on Undersampled Data")
ax = fig.add_subplot(111)

plt.boxplot(results_undersampled)
ax.set_xticklabels(names_undersampled)

plt.show()

# defining model
Model = RandomForestClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = {
    "n_estimators": [200,250,300],
    "min_samples_leaf": np.arange(1, 4),
    "max_features": [np.arange(0.3, 0.6, 0.1),'sqrt'],
    "max_samples": np.arange(0.4, 0.7, 0.1)}


#Calling RandomizedSearchCV
randomized_rf_orig = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=50, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_rf_orig.fit(X_train, y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_rf_orig.best_params_,randomized_rf_orig.best_score_))

Best parameters are {'n_estimators': 300, 'min_samples_leaf': 1, 'max_samples': 0.6, 'max_features': 'sqrt'} with CV score=0.7038786262934045:

rf_orig = RandomForestClassifier(
  n_estimators= 300, min_samples_leaf= 1, max_samples= 0.6, max_features= 'sqrt'
)
# Fit the model on training data
rf_orig.fit(X_train, y_train)

RandomForestClassifier(max_samples=0.6, n_estimators=300)

RandomForestClassifier(max_samples=0.6, n_estimators=300)

# Calculating different metrics on train set
rf_orig_train_perf = model_performance_classification_sklearn(rf_orig , X_train, y_train)
rf_orig_train_perf

# Calculating different metrics on validation set
rf_orig_val_perf = model_performance_classification_sklearn(rf_orig , X_val, y_val)
print("Validation performance:")
rf_orig_val_perf

Validation performance:

# Oversampling
Model = RandomForestClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { "n_estimators": [200,250,300],
              "min_samples_leaf": np.arange(1, 4),
               "max_features": [np.arange(0.3, 0.6, 0.1),'sqrt'],
               "max_samples": np.arange(0.4, 0.7, 0.1) }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over,y_train_over)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 300, 'min_samples_leaf': 1, 'max_samples': 0.6, 'max_features': 'sqrt'} with CV score=0.9808099737442019:

rf_over = RandomForestClassifier(
  n_estimators= 300, min_samples_leaf= 1, max_samples= 0.6, max_features= 'sqrt'
)
# Fit the model on training data
rf_over.fit(X_train, y_train)

RandomForestClassifier(max_samples=0.6, n_estimators=300)

RandomForestClassifier(max_samples=0.6, n_estimators=300)

# Calculating different metrics on train set
rf_over_train_perf = model_performance_classification_sklearn(rf_over , X_train_over, y_train_over)
rf_over_train_perf

# Calculating different metrics on validation set
rf_over_val_perf = model_performance_classification_sklearn(rf_over , X_val, y_val)
print("Validation performance:")
rf_over_val_perf

Validation performance:

# Undersampling
Model = RandomForestClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { "n_estimators": [200,250,300],
              "min_samples_leaf": np.arange(1, 4),
               "max_features": [np.arange(0.3, 0.6, 0.1),'sqrt'],
               "max_samples": np.arange(0.4, 0.7, 0.1) }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_un,y_train_un)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 250, 'min_samples_leaf': 2, 'max_samples': 0.5, 'max_features': 'sqrt'} with CV score=0.8941979305529106:

rf_un = RandomForestClassifier(
n_estimators= 250, min_samples_leaf= 2, max_samples= 0.5, max_features= 'sqrt'
)
# Fit the model on training data
rf_un.fit(X_train, y_train)

RandomForestClassifier(max_samples=0.5, min_samples_leaf=2, n_estimators=250)

RandomForestClassifier(max_samples=0.5, min_samples_leaf=2, n_estimators=250)

# Calculating different metrics on train set
rf_un_train_perf = model_performance_classification_sklearn(rf_un  , X_train_un, y_train_un)
rf_un_train_perf

# Calculating different metrics on validation set
rf_un_val_perf = model_performance_classification_sklearn(rf_un  , X_val, y_val)
print("Validation performance:")
rf_un_val_perf

Validation performance:

# Original
Model = GradientBoostingClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { "n_estimators": np.arange(100,150,25),
              "learning_rate": [0.2, 0.05, 1],
               "subsample":[0.5,0.7],
               "max_features":[0.5,0.7] }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'subsample': 0.7, 'n_estimators': 125, 'max_features': 0.5, 'learning_rate': 0.2} with CV score=0.7557354154764171:

gb_orig = GradientBoostingClassifier(
  n_estimators= 125, subsample= 0.7, max_features= 0.5, learning_rate= 0.2
)
# Fit the model on training data
gb_orig.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.2, max_features=0.5,
                           n_estimators=125, subsample=0.7)

GradientBoostingClassifier(learning_rate=0.2, max_features=0.5,
                           n_estimators=125, subsample=0.7)

# Calculating different metrics on train set
gb_orig_train_perf = model_performance_classification_sklearn(gb_orig , X_train, y_train)
gb_orig_train_perf

# Calculating different metrics on validation set
gb_orig_val_perf = model_performance_classification_sklearn(gb_orig , X_val, y_val)
print("Validation performance:")
gb_orig_val_perf

Validation performance:

# Oversampling
Model = GradientBoostingClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { "n_estimators": np.arange(100,150,25),
              "learning_rate": [0.2, 0.05, 1],
               "subsample":[0.5,0.7],
               "max_features":[0.5,0.7] }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over,y_train_over)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'subsample': 0.7, 'n_estimators': 125, 'max_features': 0.5, 'learning_rate': 1} with CV score=0.9675751074981506:

gb_over = GradientBoostingClassifier(
  subsample= 0.7, n_estimators= 125, max_features= 0.5, learning_rate= 1
)
# Fit the model on training data
gb_over.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1, max_features=0.5, n_estimators=125,
                           subsample=0.7)

GradientBoostingClassifier(learning_rate=1, max_features=0.5, n_estimators=125,
                           subsample=0.7)

# Calculating different metrics on train set
gb_over_train_perf = model_performance_classification_sklearn(gb_over , X_train_over, y_train_over)
gb_over_train_perf

# Calculating different metrics on validation set
gb_over_val_perf = model_performance_classification_sklearn(gb_over , X_val, y_val)
print("Validation performance:")
gb_over_val_perf

Validation performance:

# Undersampling
Model = GradientBoostingClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { "n_estimators": np.arange(100,150,25),
              "learning_rate": [0.2, 0.05, 1],
               "subsample":[0.5,0.7],
               "max_features":[0.5,0.7] }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_un,y_train_un)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'subsample': 0.7, 'n_estimators': 125, 'max_features': 0.5, 'learning_rate': 0.2} with CV score=0.9020567510950295:

gb_un = GradientBoostingClassifier(
  subsample= 0.7,
  n_estimators= 125,
  max_features= 0.5,
  learning_rate= 0.2
)
# Fit the model on training data
gb_un.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.2, max_features=0.5,
                           n_estimators=125, subsample=0.7)

GradientBoostingClassifier(learning_rate=0.2, max_features=0.5,
                           n_estimators=125, subsample=0.7)

# Calculating different metrics on train set
gb_un_train_perf = model_performance_classification_sklearn(gb_un , X_train_un, y_train_un)
gb_un_train_perf

# Calculating different metrics on validation set
gb_un_val_perf = model_performance_classification_sklearn(gb_un , X_val, y_val)
print("Validation performance:")
gb_un_val_perf

Validation performance:

# Original
Model = BaggingClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { 'max_samples': [0.8,0.9,1],
              'max_features': [0.7,0.8,0.9],
               'n_estimators' : [30,50,70], }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 30, 'max_samples': 0.9, 'max_features': 0.9} with CV score=0.728648511394655:

bc_orig = BaggingClassifier(
  n_estimators= 30,
  max_samples= 0.9,
  max_features= 0.9
)
# Fit the model on training data
bc_orig.fit(X_train, y_train)

BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=30)

BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=30)

# Calculating different metrics on train set
bc_orig_train_perf = model_performance_classification_sklearn(bc_orig , X_train, y_train)
bc_orig_train_perf

# Calculating different metrics on validation set
bc_orig_val_perf = model_performance_classification_sklearn(bc_orig , X_val, y_val)
print("Validation performance:")
bc_orig_val_perf

Validation performance:

# Oversampling
Model = BaggingClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { 'max_samples': [0.8,0.9,1],
              'max_features': [0.7,0.8,0.9],
               'n_estimators' : [30,50,70], }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_over,y_train_over)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 70, 'max_samples': 0.9, 'max_features': 0.9} with CV score=0.9835230801665501:

bc_over = BaggingClassifier(
  n_estimators = 70,
  max_samples = 0.9,
  max_features = 0.9
)
# Fit the model on training data
bc_over.fit(X_train, y_train)

BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=70)

BaggingClassifier(max_features=0.9, max_samples=0.9, n_estimators=70)

# Calculating different metrics on train set
bc_over_train_perf = model_performance_classification_sklearn(bc_over , X_train_over, y_train_over)
bc_over_train_perf

# Calculating different metrics on validation set
bc_over_val_perf = model_performance_classification_sklearn(bc_over , X_val, y_val)
print("Validation performance:")
bc_over_val_perf

Validation performance:

# Undersampling
Model = BaggingClassifier(random_state=1)

# Parameter grid to pass in RandomSearchCV
param_grid = { 'max_samples': [0.8,0.9,1],
              'max_features': [0.7,0.8,0.9],
               'n_estimators' : [30,50,70], }

#Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(estimator=Model, param_distributions=param_grid, n_iter=10, n_jobs = -1, scoring=scorer, cv=5, random_state=1)

#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train_un,y_train_un)

print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))

Best parameters are {'n_estimators': 70, 'max_samples': 0.8, 'max_features': 0.7} with CV score=0.8953215260585285:

bc_un = BaggingClassifier(
  n_estimators = 70,
  max_samples = 0.8,
  max_features = 0.7
)
# Fit the model on training data
bc_un.fit(X_train, y_train)

BaggingClassifier(max_features=0.7, max_samples=0.8, n_estimators=70)

BaggingClassifier(max_features=0.7, max_samples=0.8, n_estimators=70)

# Calculating different metrics on train set
bc_un_train_perf = model_performance_classification_sklearn(bc_un , X_train_un, y_train_un)
bc_un_train_perf

# Calculating different metrics on validation set
bc_un_val_perf = model_performance_classification_sklearn(bc_un , X_val, y_val)
print("Validation performance:")
bc_un_val_perf

Validation performance:

models_train_comp_df = pd.concat(
    [
      rf_orig_train_perf.T,
      gb_orig_train_perf.T,
      bc_orig_train_perf.T,
      rf_over_train_perf.T,
      gb_over_train_perf.T,
      bc_over_train_perf.T,
      rf_un_train_perf.T,
      gb_un_train_perf.T,
      bc_un_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
      'rf_orig_train_perf',
      'gb_orig_train_perf',
      'bc_orig_train_perf',
      'rf_over_train_perf',
      'gb_over_train_perf',
      'bc_over_train_perf',
      'rf_un_train_perf',
      'gb_un_train_perf',
      'bc_un_train_perf'
]
print("Training performance comparison:")
models_train_comp_df

Training performance comparison:

models_val_comp_df = pd.concat(
    [
      rf_orig_val_perf.T,
      gb_orig_val_perf.T,
      bc_orig_val_perf.T,
      rf_over_val_perf.T,
      gb_over_val_perf.T,
      bc_over_val_perf.T,
      rf_un_val_perf.T,
      gb_un_val_perf.T,
      bc_un_val_perf.T,
    ],
    axis=1,
)
models_val_comp_df.columns = [
    'rf_orig_val_perf',
    'gb_orig_val_perf',
    'bc_orig_val_perf',
    'rf_over_val_perf',
    'gb_over_val_perf',
    'bc_over_val_perf',
    'rf_un_val_perf',
    'gb_un_val_perf',
    'bc_un_val_perf'
]
print("Validation performance comparison:")
models_val_comp_df

Validation performance comparison:

X_test = test_data.drop(["Target"], axis=1)
y_test = test_data["Target"]

# Calculating different metrics on the test set
bcun_grid_test = model_performance_classification_sklearn(bc_un, X_test, y_test)
print("Test performance:")
bcun_grid_test

Test performance:

feature_names = X_train.columns
importances = bc_un.estimators_
importances = np.mean([
    tree.feature_importances_ for tree in importances
], axis=0)
indices = np.argsort(importances)

plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="green", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()

# creating a list of numerical variables
numerical_features = ["V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10",
                      "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19",
                      "V20", "V21", "V22","V23", "V24", "V25","V26", "V27", "V28", "V29",
                      "V30", "V31","V32", "V33", "V34","V35", "V36", "V37","V38", "V39", "V40"]

# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# handle_unknown = "ignore", allows model to handle any unknown category in the test data

# combining categorical transformer and numerical transformer using a column transformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
    ],
    remainder="passthrough",
)
# remainder = "passthrough" has been used, it will allow variables that are present in original data
# but not in "numerical_columns" and "categorical_columns" to pass through the column transformer without any changes

X = test_data.drop(columns="Target")
Y = test_data["Target"]

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)

(4000, 40) (1000, 40)

# Creating new pipeline with best parameters
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "BC",
            BaggingClassifier(
              n_estimators = 70,
              max_samples = 0.8,
              max_features = 0.7,
            ),
        ),
    ]
)
# Fit the model on training data
model.fit(X_train, y_train)

Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['V1', 'V2', 'V3', 'V4', 'V5',
                                                   'V6', 'V7', 'V8', 'V9',
                                                   'V10', 'V11', 'V12', 'V13',
                                                   'V14', 'V15', 'V16', 'V17',
                                                   'V18', 'V19', 'V20', 'V21',
                                                   'V22', 'V23', 'V24', 'V25',
                                                   'V26', 'V27', 'V28', 'V29',
                                                   'V30', ...])])),
                ('BC',
                 BaggingClassifier(max_features=0.7, max_samples=0.8,
                                   n_estimators=70))])

Pipeline(steps=[('pre',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['V1', 'V2', 'V3', 'V4', 'V5',
                                                   'V6', 'V7', 'V8', 'V9',
                                                   'V10', 'V11', 'V12', 'V13',
                                                   'V14', 'V15', 'V16', 'V17',
                                                   'V18', 'V19', 'V20', 'V21',
                                                   'V22', 'V23', 'V24', 'V25',
                                                   'V26', 'V27', 'V28', 'V29',
                                                   'V30', ...])])),
                ('BC',
                 BaggingClassifier(max_features=0.7, max_samples=0.8,
                                   n_estimators=70))])

ColumnTransformer(remainder='passthrough',
                  transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7',
                                  'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14',
                                  'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
                                  'V21', 'V22', 'V23', 'V24', 'V25', 'V26',
                                  'V27', 'V28', 'V29', 'V30', ...])])

['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40']

SimpleImputer(strategy='median')

[]

passthrough

BaggingClassifier(max_features=0.7, max_samples=0.8, n_estimators=70)

# Let's check the performance on test set
Model_test = model_performance_classification_sklearn(model, X_test, y_test)
Model_test

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	V29	V30	V31	V32	V33	V34	V35	V36	V37	V38	V39	V40
0	-4.465	-4.679	3.102	0.506	-0.221	-2.033	-2.911	0.051	-1.522	3.762	-5.715	0.736	0.981	1.418	-3.376	-3.047	0.306	2.914	2.270	4.395	-2.388	0.646	-1.191	3.133	0.665	-2.511	-0.037	0.726	-3.982	-1.073	1.667	3.060	-1.690	2.846	2.235	6.667	0.444	-2.369	2.951	-3.480
1	3.366	3.653	0.910	-1.368	0.332	2.359	0.733	-4.332	0.566	-0.101	1.914	-0.951	-1.255	-2.707	0.193	-4.769	-2.205	0.908	0.757	-5.834	-3.065	1.597	-1.757	1.766	-0.267	3.625	1.500	-0.586	0.783	-0.201	0.025	-1.795	3.033	-2.468	1.895	-2.298	-1.731	5.909	-0.386	0.616
2	-3.832	-5.824	0.634	-2.419	-1.774	1.017	-2.099	-3.173	-2.082	5.393	-0.771	1.107	1.144	0.943	-3.164	-4.248	-4.039	3.689	3.311	1.059	-2.143	1.650	-1.661	1.680	-0.451	-4.551	3.739	1.134	-2.034	0.841	-1.600	-0.257	0.804	4.086	2.292	5.361	0.352	2.940	3.839	-4.309
3	1.618	1.888	7.046	-1.147	0.083	-1.530	0.207	-2.494	0.345	2.119	-3.053	0.460	2.705	-0.636	-0.454	-3.174	-3.404	-1.282	1.582	-1.952	-3.517	-1.206	-5.628	-1.818	2.124	5.295	4.748	-2.309	-3.963	-6.029	4.949	-3.584	-2.577	1.364	0.623	5.550	-1.527	0.139	3.101	-1.277
4	-0.111	3.872	-3.758	-2.983	3.793	0.545	0.205	4.849	-1.855	-6.220	1.998	4.724	0.709	-1.989	-2.633	4.184	2.245	3.734	-6.313	-5.380	-0.887	2.062	9.446	4.490	-3.945	4.582	-8.780	-3.383	5.107	6.788	2.044	8.266	6.629	-10.069	1.223	-3.230	1.687	-2.164	-3.645	6.510

	count	mean	std	min	25%	50%	75%	max
V1	19982.000	-0.272	3.442	-11.876	-2.737	-0.748	1.840	15.493
V2	19982.000	0.440	3.151	-12.320	-1.641	0.472	2.544	13.089
V3	20000.000	2.485	3.389	-10.708	0.207	2.256	4.566	17.091
V4	20000.000	-0.083	3.432	-15.082	-2.348	-0.135	2.131	13.236
V5	20000.000	-0.054	2.105	-8.603	-1.536	-0.102	1.340	8.134
V6	20000.000	-0.995	2.041	-10.227	-2.347	-1.001	0.380	6.976
V7	20000.000	-0.879	1.762	-7.950	-2.031	-0.917	0.224	8.006
V8	20000.000	-0.548	3.296	-15.658	-2.643	-0.389	1.723	11.679
V9	20000.000	-0.017	2.161	-8.596	-1.495	-0.068	1.409	8.138
V10	20000.000	-0.013	2.193	-9.854	-1.411	0.101	1.477	8.108
V11	20000.000	-1.895	3.124	-14.832	-3.922	-1.921	0.119	11.826
V12	20000.000	1.605	2.930	-12.948	-0.397	1.508	3.571	15.081
V13	20000.000	1.580	2.875	-13.228	-0.224	1.637	3.460	15.420
V14	20000.000	-0.951	1.790	-7.739	-2.171	-0.957	0.271	5.671
V15	20000.000	-2.415	3.355	-16.417	-4.415	-2.383	-0.359	12.246
V16	20000.000	-2.925	4.222	-20.374	-5.634	-2.683	-0.095	13.583
V17	20000.000	-0.134	3.345	-14.091	-2.216	-0.015	2.069	16.756
V18	20000.000	1.189	2.592	-11.644	-0.404	0.883	2.572	13.180
V19	20000.000	1.182	3.397	-13.492	-1.050	1.279	3.493	13.238
V20	20000.000	0.024	3.669	-13.923	-2.433	0.033	2.512	16.052
V21	20000.000	-3.611	3.568	-17.956	-5.930	-3.533	-1.266	13.840
V22	20000.000	0.952	1.652	-10.122	-0.118	0.975	2.026	7.410
V23	20000.000	-0.366	4.032	-14.866	-3.099	-0.262	2.452	14.459
V24	20000.000	1.134	3.912	-16.387	-1.468	0.969	3.546	17.163
V25	20000.000	-0.002	2.017	-8.228	-1.365	0.025	1.397	8.223
V26	20000.000	1.874	3.435	-11.834	-0.338	1.951	4.130	16.836
V27	20000.000	-0.612	4.369	-14.905	-3.652	-0.885	2.189	17.560
V28	20000.000	-0.883	1.918	-9.269	-2.171	-0.891	0.376	6.528
V29	20000.000	-0.986	2.684	-12.579	-2.787	-1.176	0.630	10.722
V30	20000.000	-0.016	3.005	-14.796	-1.867	0.184	2.036	12.506
V31	20000.000	0.487	3.461	-13.723	-1.818	0.490	2.731	17.255
V32	20000.000	0.304	5.500	-19.877	-3.420	0.052	3.762	23.633
V33	20000.000	0.050	3.575	-16.898	-2.243	-0.066	2.255	16.692
V34	20000.000	-0.463	3.184	-17.985	-2.137	-0.255	1.437	14.358
V35	20000.000	2.230	2.937	-15.350	0.336	2.099	4.064	15.291
V36	20000.000	1.515	3.801	-14.833	-0.944	1.567	3.984	19.330
V37	20000.000	0.011	1.788	-5.478	-1.256	-0.128	1.176	7.467
V38	20000.000	-0.344	3.948	-17.375	-2.988	-0.317	2.279	15.290
V39	20000.000	0.891	1.753	-6.439	-0.272	0.919	2.058	7.760
V40	20000.000	-0.876	3.012	-11.024	-2.940	-0.921	1.120	10.654
Target	20000.000	0.056	0.229	0.000	0.000	0.000	0.000	1.000

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	V29	V30	V31	V32	V33	V34	V35	V36	V37	V38	V39	V40
0	-0.613	-3.820	2.202	1.300	-1.185	-4.496	-1.836	4.723	1.206	-0.342	-5.123	1.017	4.819	3.269	-2.984	1.387	2.032	-0.512	-1.023	7.339	-2.242	0.155	2.054	-2.772	1.851	-1.789	-0.277	-1.255	-3.833	-1.505	1.587	2.291	-5.411	0.870	0.574	4.157	1.428	-10.511	0.455	-1.448
1	0.390	-0.512	0.527	-2.577	-1.017	2.235	-0.441	-4.406	-0.333	1.967	1.797	0.410	0.638	-1.390	-1.883	-5.018	-3.827	2.418	1.762	-3.242	-3.193	1.857	-1.708	0.633	-0.588	0.084	3.014	-0.182	0.224	0.865	-1.782	-2.475	2.494	0.315	2.059	0.684	-0.485	5.128	1.721	-1.488
2	-0.875	-0.641	4.084	-1.590	0.526	-1.958	-0.695	1.347	-1.732	0.466	-4.928	3.565	-0.449	-0.656	-0.167	-1.630	2.292	2.396	0.601	1.794	-2.120	0.482	-0.841	1.790	1.874	0.364	-0.169	-0.484	-2.119	-2.157	2.907	-1.319	-2.997	0.460	0.620	5.632	1.324	-1.752	1.808	1.676
3	0.238	1.459	4.015	2.534	1.197	-3.117	-0.924	0.269	1.322	0.702	-5.578	-0.851	2.591	0.767	-2.391	-2.342	0.572	-0.934	0.509	1.211	-3.260	0.105	-0.659	1.498	1.100	4.143	-0.248	-1.137	-5.356	-4.546	3.809	3.518	-3.074	-0.284	0.955	3.029	-1.367	-3.412	0.906	-2.451
4	5.828	2.768	-1.235	2.809	-1.642	-1.407	0.569	0.965	1.918	-2.775	-0.530	1.375	-0.651	-1.679	-0.379	-4.443	3.894	-0.608	2.945	0.367	-5.789	4.598	4.450	3.225	0.397	0.248	-2.362	1.079	-0.473	2.243	-3.591	1.774	-1.502	-2.227	4.777	-6.560	-0.806	-0.276	-3.858	-0.538

	count	mean	std	min	25%	50%	75%	max
V1	4995.000	-0.278	3.466	-12.382	-2.744	-0.765	1.831	13.504
V2	4994.000	0.398	3.140	-10.716	-1.649	0.427	2.444	14.079
V3	5000.000	2.552	3.327	-9.238	0.315	2.260	4.587	15.315
V4	5000.000	-0.049	3.414	-14.682	-2.293	-0.146	2.166	12.140
V5	5000.000	-0.080	2.111	-7.712	-1.615	-0.132	1.341	7.673
V6	5000.000	-1.042	2.005	-8.924	-2.369	-1.049	0.308	5.068
V7	5000.000	-0.908	1.769	-8.124	-2.054	-0.940	0.212	7.616
V8	5000.000	-0.575	3.332	-12.253	-2.642	-0.358	1.713	10.415
V9	5000.000	0.030	2.174	-6.785	-1.456	-0.080	1.450	8.851
V10	5000.000	0.019	2.145	-8.171	-1.353	0.166	1.511	6.599
V11	5000.000	-2.009	3.112	-13.152	-4.050	-2.043	0.044	9.956
V12	5000.000	1.576	2.907	-8.164	-0.450	1.488	3.563	12.984
V13	5000.000	1.622	2.883	-11.548	-0.126	1.719	3.465	12.620
V14	5000.000	-0.921	1.803	-7.814	-2.111	-0.896	0.272	5.734
V15	5000.000	-2.452	3.387	-15.286	-4.479	-2.417	-0.433	11.673
V16	5000.000	-3.019	4.264	-20.986	-5.648	-2.774	-0.178	13.976
V17	5000.000	-0.104	3.337	-13.418	-2.228	0.047	2.112	19.777
V18	5000.000	1.196	2.586	-12.214	-0.409	0.881	2.604	13.642
V19	5000.000	1.210	3.385	-14.170	-1.026	1.296	3.526	12.428
V20	5000.000	0.138	3.657	-13.720	-2.325	0.193	2.540	13.871
V21	5000.000	-3.664	3.578	-16.341	-5.944	-3.663	-1.330	11.047
V22	5000.000	0.962	1.640	-6.740	-0.048	0.986	2.029	7.505
V23	5000.000	-0.422	4.057	-14.422	-3.163	-0.279	2.426	13.181
V24	5000.000	1.089	3.968	-12.316	-1.623	0.913	3.537	17.806
V25	5000.000	0.061	2.010	-6.770	-1.298	0.077	1.428	6.557
V26	5000.000	1.847	3.400	-11.414	-0.242	1.917	4.156	17.528
V27	5000.000	-0.552	4.403	-13.177	-3.663	-0.872	2.247	17.290
V28	5000.000	-0.868	1.926	-7.933	-2.160	-0.931	0.421	7.416
V29	5000.000	-1.096	2.655	-9.988	-2.861	-1.341	0.522	14.039
V30	5000.000	-0.119	3.023	-12.438	-1.997	0.112	1.946	10.315
V31	5000.000	0.469	3.446	-11.263	-1.822	0.486	2.779	12.559
V32	5000.000	0.233	5.586	-17.244	-3.556	-0.077	3.752	26.539
V33	5000.000	-0.080	3.539	-14.904	-2.348	-0.160	2.099	13.324
V34	5000.000	-0.393	3.166	-14.700	-2.010	-0.172	1.465	12.146
V35	5000.000	2.211	2.948	-12.261	0.322	2.112	4.032	13.489
V36	5000.000	1.595	3.775	-12.736	-0.866	1.703	4.104	17.116
V37	5000.000	0.023	1.785	-5.079	-1.241	-0.110	1.238	6.810
V38	5000.000	-0.406	3.969	-15.335	-2.984	-0.381	2.288	13.065
V39	5000.000	0.939	1.717	-5.451	-0.208	0.959	2.131	7.182
V40	5000.000	-0.932	2.978	-10.076	-2.987	-1.003	1.080	8.698
Target	5000.000	0.056	0.231	0.000	0.000	0.000	0.000	1.000

	proportion
Target
0	0.945
1	0.056

	rf_orig_train_perf	gb_orig_train_perf	bc_orig_train_perf	rf_over_train_perf	gb_over_train_perf	bc_over_train_perf	rf_un_train_perf	gb_un_train_perf	bc_un_train_perf
Accuracy	0.996	0.994	0.999	0.902	0.851	0.913	0.916	0.951	0.992
Recall	0.919	0.902	0.980	0.803	0.708	0.826	0.832	0.904	0.984
Precision	1.000	0.985	1.000	1.000	0.991	1.000	1.000	0.998	1.000
F1	0.958	0.942	0.990	0.891	0.826	0.905	0.908	0.949	0.992

	rf_orig_val_perf	gb_orig_val_perf	bc_orig_val_perf	rf_over_val_perf	gb_over_val_perf	bc_over_val_perf	rf_un_val_perf	gb_un_val_perf	bc_un_val_perf
Accuracy	0.984	0.983	0.983	0.985	0.968	0.984	0.983	0.981	0.985
Recall	0.730	0.766	0.725	0.739	0.617	0.739	0.712	0.752	0.743
Precision	0.982	0.909	0.947	0.982	0.761	0.965	0.981	0.888	0.982
F1	0.837	0.831	0.821	0.843	0.682	0.837	0.825	0.815	0.846

ReneWind¶

Problem Statement¶

Business Context¶

Objective¶

Data Description¶

Importing necessary libraries¶

Loading the dataset¶

Data Overview¶

Train data observations¶

Test data observations¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Plotting histograms and boxplots for all the variables¶

Plotting all the features at one go¶

Data distribution¶

Observations:¶

Data Pre-processing¶

Splitting train dataset into training and validation set¶

Splitting test data into X_test and y_test¶

Missing value imputation¶

Model Building¶

Model evaluation criterion¶

Defining scorer to be used for cross-validation and hyperparameter tuning¶

Model Building with original data¶

Model Building with Oversampled data¶

Model Building with Undersampled data¶

HyperparameterTuning¶

Sample Parameter Grids¶

Random Forest¶

Hyperparameter tuning of Random Forest on Original Data¶

Hyperparameter tuning of Random Forest on oversampled data¶

Hyperparameter tuning of Random Forest on undersampled data¶

Gradient Boosting¶

Hyperparameter tuning of Gradient boosting on Original Data¶

Hyperparameter tuning of Gradient boosting on Oversampled data¶

Hyperparameter tuning of Gradient boosting on Undersampled Data¶

Bagging Classifier¶

Hyperparameter tuning of Bagging Classifier boosting on Original Data¶

Hyperparameter tuning of Bagging classifier on Oversampled data¶

Hyperparameter tuning of Bagging classifier on Undersampled Data¶

Model performance comparison and choosing the final model¶

Validation in Training dataset¶

Validation in Validation dataset¶

Test set final performance¶

Pipelines to build the final model¶

Business Insights and Conclusions¶

`Train data observations`¶

`Test data observations`¶