import warnings
warnings.filterwarnings("ignore")

# Libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree


# Scalers for normalization data
from sklearn.preprocessing import MinMaxScaler

# Splitting data into train and test
from sklearn.model_selection import train_test_split

# Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier


# Metrics to evaluate the model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, recall_score, precision_score, accuracy_score
from sklearn.metrics import precision_recall_curve


# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, make_scorer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.impute import KNNImputer
import scipy.stats as stats


learn = pd.read_csv("hmeq.csv")


# Copying data to another variable to avoid any changes to the original data
data = learn.copy()


data.head()


data.tail()


data.shape

(5960, 13)


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BAD      5960 non-null   int64  
 1   LOAN     5960 non-null   int64  
 2   MORTDUE  5442 non-null   float64
 3   VALUE    5848 non-null   float64
 4   REASON   5708 non-null   object 
 5   JOB      5681 non-null   object 
 6   YOJ      5445 non-null   float64
 7   DEROG    5252 non-null   float64
 8   DELINQ   5380 non-null   float64
 9   CLAGE    5652 non-null   float64
 10  NINQ     5450 non-null   float64
 11  CLNO     5738 non-null   float64
 12  DEBTINC  4693 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 605.4+ KB


# Checking for duplicate values
data.duplicated().sum()

0


# Checking for any missing values just in case
data.isnull().sum()

BAD           0
LOAN          0
MORTDUE     518
VALUE       112
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64


# Making a list of all categorical variables
cat_col = list(data.select_dtypes("object").columns)
print(cat_col)

['REASON', 'JOB']


# Making a list of all categorical variables
num_col = list(data.select_dtypes(include=["int64", "float64"]).columns)
print(num_col)

['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']


data[num_col].describe().T


# Creating histograms
data[num_col].hist(figsize = (14, 14))

plt.show()


plt.figure(figsize = (15, 5))
plt.rcParams.update({'font.size': 12})
a = 10
col = 'LOAN'   
data[col].dropna().hist(bins=a, grid=False)
plt.ylim(0, np.max(np.histogram(data[col].dropna(), bins=a)[0]) + 50)
plt.xticks([])
# Add to the maximum y axis tick
ticks = plt.yticks()[0]  
plt.yticks(ticks)  
total = float(len(data[col].dropna()))
plt.text(data[col].dropna().max(), 0, str('{:.1f}'.format(data[col].dropna().max())), ha='center', va='top', color = 'black', rotation=90) 

# add data labels to the histogram
for i, v in enumerate(np.histogram(data[col].dropna(), bins=a)[0]):
    plt.text((data[col].dropna().min() + (i + 0.5) * (data[col].dropna().max() - data[col].dropna().min()) / 10), v + 5, str(v), ha='center', va='bottom', color = 'black')

# add percentage labels 
for i, v in enumerate(np.histogram(data[col].dropna(), bins=a)[0]):
    plt.text((data[col].dropna().min() + (i + 0.5) * (data[col].dropna().max() - data[col].dropna().min()) / 10), v + 115, str('{:.1f}%'.format(100 * v / total)), ha='center', va='bottom', color = 'blue')

# add percentage labels 
for i, v in enumerate(np.histogram(data[col].dropna(), bins=a)[0]):
    plt.text((data[col].dropna().min() + (i) * (data[col].dropna().max() - data[col].dropna().min()) / 10), 0, str('{:.1f}'.format(data[col].dropna().min() + (i ) * (data[col].dropna().max() - data[col].dropna().min()) / 10)), ha='center', va='top', color = 'black', rotation=90)

plt.ylim(0, plt.ylim()[1])       
plt.ylabel('count')
plt.show()


plt.figure(figsize = (15, 5))
plt.rcParams.update({'font.size': 12})
a = 10
col = 'YOJ'   
data[col].dropna().hist(bins=a, grid=False)
plt.ylim(0, np.max(np.histogram(data[col].dropna(), bins=a)[0]) + 100)
plt.xticks([])
# Add to the maximum y axis tick
ticks = plt.yticks()[0]  
plt.yticks(ticks)  
total = float(len(data[col].dropna()))
plt.text(data[col].dropna().max(), 0, str('{:.1f}'.format(data[col].dropna().max())), ha='center', va='top', color = 'black', rotation=90) 

# add data labels to the histogram
for i, v in enumerate(np.histogram(data[col].dropna(), bins=a)[0]):
    plt.text((data[col].dropna().min() + (i + 0.5) * (data[col].dropna().max() - data[col].dropna().min()) / 10), v + 5, str(v), ha='center', va='bottom', color = 'black')

# add percentage labels 
for i, v in enumerate(np.histogram(data[col].dropna(), bins=a)[0]):
    plt.text((data[col].dropna().min() + (i + 0.5) * (data[col].dropna().max() - data[col].dropna().min()) / 10), v + 115, str('{:.1f}%'.format(100 * v / total)), ha='center', va='bottom', color = 'blue')

# add percentage labels 
for i, v in enumerate(np.histogram(data[col].dropna(), bins=a)[0]):
    plt.text((data[col].dropna().min() + (i) * (data[col].dropna().max() - data[col].dropna().min()) / 10), 0, str('{:.1f}'.format(data[col].dropna().min() + (i ) * (data[col].dropna().max() - data[col].dropna().min()) / 10)), ha='center', va='top', color = 'black', rotation=90)

plt.ylim(0, plt.ylim()[1]+100)       
plt.ylabel('count')
plt.show()


for col in cat_col:
    print(data[col].value_counts())
    print(' ')

DebtCon    3928
HomeImp    1780
Name: REASON, dtype: int64
 
Other      2388
ProfExe    1276
Office      948
Mgr         767
Self        193
Sales       109
Name: JOB, dtype: int64


plt.figure(figsize = (10, 6))

ax = sns.countplot(x = 'REASON', hue = 'BAD', data = data)
i = 1
# Annotating the exact count on the top of the bar for each category 
for p in ax.patches:
  
    ax.annotate(p.get_height(), (p.get_x(), p.get_height()+ 0.35))
    i = i +1
grouped_B = data.groupby(['REASON', 'BAD']).size()
grouped_R = data.groupby(['REASON']).size()
percent = grouped_B/grouped_R
percent = percent.reset_index()
percent = percent.sort_values(by='REASON', ascending=False)
print(percent)
    
plt.show()

    REASON  BAD         0
2  HomeImp    0  0.777528
3  HomeImp    1  0.222472
0  DebtCon    0  0.810336
1  DebtCon    1  0.189664


# create a contingency table of REASON and BAD variables
contingency_table = pd.crosstab(data['REASON'], data['BAD'])

# perform the chi-squared test
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

# print the results
print('Chi-Squared Statistic:', chi2)
print('P-Value:', p_value)

Chi-Squared Statistic: 8.039751291499368
P-Value: 0.004576181950707232


# create a strip plot of loan amount by default status
sns.stripplot(y='LOAN', x='BAD', data=data)

# calculate the mean loan amount for bad loans
mean_bad_loans = data[data['BAD'] == 1]['LOAN'].mean()

# calculate the mean loan amount for good loans
mean_good_loans = data[data['BAD'] == 0]['LOAN'].mean()

# add horizontal lines at the locations of the means
plt.axhline(mean_bad_loans, color='orange', linestyle='--')
plt.axhline(mean_good_loans, color='blue', linestyle='--')

# display the plot
plt.show()


plt.figure(figsize = (8, 10))

sns.boxplot(y='LOAN', x='BAD', data=data)

plt.show()


# calculate the mean loan amount for each group
bad_loans = data[data['BAD'] == 1]['LOAN'].mean()
good_loans = data[data['BAD'] == 0]['LOAN'].mean()

# perform a t-test
t_statistic, p_value = stats.ttest_ind(data[data['BAD'] == 1]['LOAN'], data[data['BAD'] == 0]['LOAN'], equal_var=False)

# print the results
print('Mean loan amount for bad loans:', bad_loans)
print('Mean loan amount for good loans:', good_loans)
print('T-Statistic:', t_statistic)
print('P-Value:', p_value)

Mean loan amount for bad loans: 16922.11942809083
Mean loan amount for good loans: 19028.107315028297
T-Statistic: -5.720042746865789
P-Value: 1.2455336996284883e-08


# remove infinite and missing values from VALUE and BAD variables
value = data['VALUE'][np.isfinite(data['VALUE'])].dropna()
bad = data['BAD'][np.isfinite(data['VALUE'])].dropna()

# calculate the Pearson correlation coefficient between VALUE and BAD
corr_coef, p_value = stats.pearsonr(value, bad)

# print the results
print('Pearson correlation coefficient:', "{:.2%}".format(corr_coef))

Pearson correlation coefficient: -3.00%


# create a strip plot of loan amount by default status
sns.stripplot(y='MORTDUE', x='BAD', data=data)

# calculate the mean loan amount for bad loans
mean_bad_loans = data[data['BAD'] == 1]['MORTDUE'].mean()

# calculate the mean loan amount for good loans
mean_good_loans = data[data['BAD'] == 0]['MORTDUE'].mean()

# add horizontal lines at the locations of the means
plt.axhline(mean_bad_loans, color='orange', linestyle='--')
plt.axhline(mean_good_loans, color='blue', linestyle='--')

# display the plot
plt.show()


# calculate the mean mortgage amount for each group
bad_mortgage = data[data['BAD'] == 1]['MORTDUE'].mean()
good_mortgage = data[data['BAD'] == 0]['MORTDUE'].mean()

# perform a t-test
t_statistic, p_value = stats.ttest_ind(data[data['BAD'] == 1]['MORTDUE'].dropna(), data[data['BAD'] == 0]['MORTDUE'].dropna(), equal_var=False)

# print the results
print('Mean mortgage amount for bad loans:', bad_mortgage)
print('Mean mortgage amount for good loans:', good_mortgage)
print('T-Statistic:', t_statistic)
print('P-Value:', p_value)

Mean mortgage amount for bad loans: 69460.45297322252
Mean mortgage amount for good loans: 74829.2490548291
T-Statistic: -3.377418509434846
P-Value: 0.000749659556451092


# create a pivot table of JOB and BAD variables
job_bad_table = pd.crosstab(data['JOB'], data['BAD'])

# chi - test
chi2, p_value, dof, expected = stats.chi2_contingency(job_bad_table)

# display the results
print('Chi-Squared Statistic:', chi2)
print('P-Value:', p_value)

Chi-Squared Statistic: 81.93248953692773
P-Value: 3.306676232858524e-16


# create a pivot table of JOB and BAD variables
pivot_table = pd.pivot_table(data, values='BAD', index='JOB', aggfunc=lambda x: np.mean(x)*100)

# sort the pivot table by the BAD column in descending order
pivot_table = pivot_table.sort_values(by='BAD', ascending=False)

# rename the columns
pivot_table.columns = ['% of Bad Loans']

# define a function to highlight cells with value greater than 30%
def highlight_greater_than_30(val):
    color = 'red' if val > 30 else 'black'
    return 'color: %s' % color

# apply the function to the pivot table
styled_table = pivot_table.style.applymap(highlight_greater_than_30)

# display the styled table
styled_table


plt.figure(figsize = (10, 6))

sns.countplot(x = 'JOB', hue = 'BAD', data = data)

plt.show()


# create a pivot table of DELINQ and BAD variables
pivot_table = pd.pivot_table(data, values='BAD', index='DELINQ', aggfunc=lambda x: np.mean(x)*100)

# sort the pivot table by the DELINQ column in ascending order
pivot_table = pivot_table.sort_values(by='DELINQ', ascending=True)

# rename the columns
pivot_table.columns = ['% of Bad Loans']

# plot the line graph
plt.plot(pivot_table.index, pivot_table['% of Bad Loans'])
plt.title('Percentage of Bad Loans by number of delinquent credit lines')
plt.xlabel('DELINQ')
plt.ylabel('% of Bad Loans')
plt.show()


# create a pivot table of DELINQ and BAD variables
pivot_table = pd.pivot_table(data, values='BAD', index='DEROG', aggfunc=lambda x: np.mean(x)*100)

# sort the pivot table by the DELINQ column in ascending order
pivot_table = pivot_table.sort_values(by='DEROG', ascending=True)

# rename the columns
pivot_table.columns = ['% of Bad Loans']

# plot the line graph
plt.plot(pivot_table.index, pivot_table['% of Bad Loans'])
plt.title('Percentage of Bad Loans by Number of major derogatory reports')
plt.xlabel('DEROG')
plt.ylabel('% of Bad Loans')
plt.show()


plt.figure(figsize = (8, 10))

sns.boxplot(y='CLAGE', x='BAD', data=data)

plt.show()


# calculate the mean mortgage amount for each group
bad_loan = data[data['BAD'] == 1]['CLAGE'].mean()
good_loan = data[data['BAD'] == 0]['CLAGE'].mean()

# perform a t-test
t_statistic, p_value = stats.ttest_ind(data[data['BAD'] == 1]['CLAGE'].dropna(), data[data['BAD'] == 0]['CLAGE'].dropna(), equal_var=False)

# print the results
print('Mean age of the oldest credit line for bad loans: \033[1m{}\033[0m'.format(round(bad_loan)), '\033[1mmonths\033[0m')
print('Mean age of the oldest credit line for good loans: \033[1m{}\033[0m'.format(round(good_loan)), '\033[1mmonths\033[0m')
print('T-Statistic: \033[1m{}\033[0m'.format(t_statistic))
print('P-Value: \033[1m{}\033[0m'.format(p_value))

Mean age of the oldest credit line for bad loans: 150 months
Mean age of the oldest credit line for good loans: 187 months
T-Statistic: -12.960945049977436
P-Value: 1.0785360863493084e-36


# calculate the mean years of job for each group
bad_loan = data[data['BAD'] == 1]['YOJ'].mean()
good_loan = data[data['BAD'] == 0]['YOJ'].mean()

# perform a t-test
t_statistic, p_value = stats.ttest_ind(data[data['BAD'] == 1]['YOJ'].dropna(), data[data['BAD'] == 0]['YOJ'].dropna(), equal_var=False)

# print the results
print('Mean years of job for bad loans: \033[1m{}\033[0m'.format(round(bad_loan,1)), '\033[1yyears\033[0m')
print('Mean years of job for good loans: \033[1m{}\033[0m'.format(round(good_loan,1)), '\033[1yyears\033[0m')
print('T-Statistic: \033[1m{}\033[0m'.format(t_statistic))
print('P-Value: \033[1m{}\033[0m'.format(p_value))

Mean years of job for bad loans: 8.0 years
Mean years of job for good loans: 9.2 years
T-Statistic: -4.6603785260375625
P-Value: 3.3798921146392507e-06


plt.figure(figsize = (8, 10))

sns.boxplot(y='YOJ', x='BAD', data=data)

plt.show()


plt.figure(figsize = (8, 10))

sns.boxplot(y='DEBTINC', x='BAD', data=data)

plt.show()


# calculate the mean mortgage amount for each group
bad_loan = data[data['BAD'] == 1]['DEBTINC'].mean()
good_loan = data[data['BAD'] == 0]['DEBTINC'].mean()

# perform a t-test
t_statistic, p_value = stats.ttest_ind(data[data['BAD'] == 1]['DEBTINC'].dropna(), data[data['BAD'] == 0]['DEBTINC'].dropna(), equal_var=False)

# print the results
print('Mean debt-to-income ratio for bad loans:', bad_loan)
print('Mean debt-to-income ratio for good loans:', good_loan)
print('T-Statistic:', t_statistic)
print('P-Value:', p_value)

Mean debt-to-income ratio for bad loans: 39.387644892291064
Mean debt-to-income ratio for good loans: 33.25312863402213
T-Statistic: 6.898731654644972
P-Value: 1.976105933172547e-11


plt.figure(figsize = (15, 10))

sns.heatmap(data.corr(), annot = True, fmt = '0.2f', square=True)

plt.show()


data_L = data.copy()

# Creating duplicates of columns 
for col in data_L.columns:
    data_L[col + '_L'] = data_L[col]

# marking missing values
for col in data_L.columns:
    if data_L[col].isnull().any():
        data_L[col + '_L'] = data_L[col + '_L'].isnull().astype(int)

# Displaying the first 5 rows of the dataframe
print(data_L.head())

   BAD  LOAN  MORTDUE     VALUE   REASON     JOB   YOJ  DEROG  DELINQ  \
0    1  1100  25860.0   39025.0  HomeImp   Other  10.5    0.0     0.0   
1    1  1300  70053.0   68400.0  HomeImp   Other   7.0    0.0     2.0   
2    1  1500  13500.0   16700.0  HomeImp   Other   4.0    0.0     0.0   
3    1  1500      NaN       NaN      NaN     NaN   NaN    NaN     NaN   
4    0  1700  97800.0  112000.0  HomeImp  Office   3.0    0.0     0.0   

        CLAGE  ...  VALUE_L  REASON_L  JOB_L  YOJ_L  DEROG_L  DELINQ_L  \
0   94.366667  ...        0         0      0      0        0         0   
1  121.833333  ...        0         0      0      0        0         0   
2  149.466667  ...        0         0      0      0        0         0   
3         NaN  ...        1         1      1      1        1         1   
4   93.333333  ...        0         0      0      0        0         0   

   CLAGE_L  NINQ_L  CLNO_L  DEBTINC_L  
0        0       0       0          1  
1        0       0       0          1  
2        0       0       0          1  
3        1       1       1          1  
4        0       0       0          1  

[5 rows x 26 columns]


dataM = data.copy()


# Check for missing values before applying any imputation techniques.
print(dataM.isna().sum())

BAD           0
LOAN          0
MORTDUE     518
VALUE       112
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64


# Creating a KNNImputer object
imputer = KNNImputer(n_neighbors=5)

# Selecting features for imputing missing values
features = [ 'MORTDUE', 'VALUE']

# Filling missing values using KNNImputer
dataM[features] = imputer.fit_transform(dataM[features])

# Printing missing value information after imputation
print(dataM.isna().sum())

BAD           0
LOAN          0
MORTDUE       0
VALUE         0
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64


# fill missing values with median, taking into account the BAD column
for col in num_col:
    for value in dataM['BAD'].unique():
        median_val = dataM[dataM['BAD'] == value][col].median()
        dataM.loc[(dataM['BAD'] == value) & (dataM[col].isnull()), col] = median_val


# create a function to fill missing values with mode, taking into account the BAD column
def fillna_mode_with_bad(data, cat_col):
    for col in cat_col:
        # определяем моду
        mode = dataM[dataM['BAD'] == 0][col].mode()[0]
        # заполняем пропущенные значения для BAD = 0 модой
        dataM.loc[(dataM[col].isnull()) & (dataM['BAD'] == 0), col] = mode
        # определяем моду для BAD = 1
        mode = dataM[dataM['BAD'] == 1][col].mode()[0]
        # заполняем пропущенные значения для BAD = 1 модой
        dataM.loc[(dataM[col].isnull()) & (dataM['BAD'] == 1), col] = mode
    return data

# заполняем пропущенные значения модой с учетом BAD для столбцов из cat_col
dataM = fillna_mode_with_bad(dataM, cat_col)


dataM.isnull().sum()

BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64


dataLGB = dataM.copy()


dataLGB.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BAD      5960 non-null   int64  
 1   LOAN     5960 non-null   int64  
 2   MORTDUE  5960 non-null   float64
 3   VALUE    5960 non-null   float64
 4   REASON   5960 non-null   object 
 5   JOB      5960 non-null   object 
 6   YOJ      5960 non-null   float64
 7   DEROG    5960 non-null   float64
 8   DELINQ   5960 non-null   float64
 9   CLAGE    5960 non-null   float64
 10  NINQ     5960 non-null   float64
 11  CLNO     5960 non-null   float64
 12  DEBTINC  5960 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 605.4+ KB


job_mappingJ = {'Self': 0, 'ProfExe': 1, 'Other': 2, 'Mgr': 3, 'Office': 4, 'Sales': 5}
dataLGB['JOB'] = dataLGB['JOB'].map(job_mappingJ)


job_mappingR = {'DebtCon': 0, 'HomeImp': 1}
dataLGB['REASON'] = dataLGB['REASON'].map(job_mappingR)


dataLGB.head()


plt.figure(figsize = (15, 10))

sns.heatmap(dataLGB.corr(), annot = True, fmt = '0.2f', square=True)

plt.show()


# ПConverting categorical columns using the get_dummies function
encoded_data = pd.get_dummies(dataM[cat_col], prefix=cat_col, drop_first=True)

# Replacing categorical columns with dummy variables columns
dataM = pd.concat([dataM.drop(cat_col, axis=1), encoded_data], axis=1)


dataM.head()


# Creating metric function

def metrics_score(actual, predicted):
    
    print(classification_report(actual, predicted))
    
    cm = confusion_matrix(actual, predicted)
    
    plt.figure(figsize = (8, 5))
    
    sns.heatmap(cm, annot = True, fmt = '.2f', xticklabels = ['NOT DEFAULT', 'DEFAULT'], yticklabels = ['NOT DEFAULT', 'DEFAULT'])
    plt.ylabel('Actual')
    
    plt.xlabel('Predicted')
    
    plt.show()
    
def model_performance_classification(model, predictors, target):
    """
    Function to compute different metrics to check classification model performance

    model: classifier
    
    predictors: independent variables
    
    target: dependent variable
    """

    # Predicting using the independent variables
    pred = model.predict(predictors)

    recall = recall_score(target, pred,average = 'macro')                 # To compute recall
    
    precision = precision_score(target, pred, average = 'macro')              # To compute precision
               
    acc = accuracy_score(target, pred)                                 # To compute accuracy score
    

    # Creating a dataframe of metrics
    
    df_perf = pd.DataFrame(
        {
            "Precision":  precision,
            "Recall":  recall,
            "Accuracy": acc,
        },
        
        index = [0],
    )

    return df_perf


dataR = dataM.copy()


Y = dataR.BAD

X = dataR.drop(['BAD'], axis = 1)

sc = MinMaxScaler()

X_scaled = sc.fit_transform(X)

X_scaled = pd.DataFrame(X_scaled, columns = X.columns)

# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size = 0.3, random_state = 1, stratify = Y)


print(Y.value_counts())

0    4771
1    1189
Name: BAD, dtype: int64


# Fitting the logistic regression model
lg = LogisticRegression()

lg.fit(x_train,y_train)

LogisticRegression()

LogisticRegression()


# Checking the performance on the training data
y_pred_trainLg = lg.predict(x_train)

metrics_score(y_train, y_pred_trainLg)

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      3340
           1       0.81      0.26      0.39       832

    accuracy                           0.84      4172
   macro avg       0.83      0.62      0.65      4172
weighted avg       0.84      0.84      0.81      4172


# Checking the performance on the test dataset
y_pred_testLg = lg.predict(x_test)
metrics_score(y_test, y_pred_testLg)

              precision    recall  f1-score   support

           0       0.84      0.98      0.90      1431
           1       0.76      0.23      0.35       357

    accuracy                           0.83      1788
   macro avg       0.80      0.60      0.63      1788
weighted avg       0.82      0.83      0.79      1788


# Printing the coefficients of logistic regression
cols = X.columns

coef_lg = lg.coef_

pd.DataFrame(coef_lg,columns = cols).T.sort_values(by = 0, ascending = False)


odds = np.exp(lg.coef_[0]) # Finding the odds

# Adding the odds to a DataFrame and sorting the values
pd.DataFrame(odds, x_train.columns, columns = ['odds']).sort_values(by = 'odds', ascending = False)


y_scores_lg = lg.predict_proba(x_train) # predict_proba gives the probability of each observation belonging to each class


precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(y_train, y_scores_lg[:, 1])

# Plot values of precisions, recalls, and thresholds
plt.figure(figsize = (10, 7))

plt.plot(thresholds_lg, precisions_lg[:-1], 'b--', label = 'precision')

plt.plot(thresholds_lg, recalls_lg[:-1], 'g--', label = 'recall')

plt.xlabel('Threshold')

plt.legend(loc = 'upper left')

plt.ylim([0, 1])

plt.show()


optimal_threshold1 = .23

y_pred_train_LgOT = lg.predict_proba(x_train)

metrics_score(y_train, y_pred_train_LgOT[:, 1] > optimal_threshold1)

              precision    recall  f1-score   support

           0       0.91      0.84      0.87      3340
           1       0.51      0.65      0.57       832

    accuracy                           0.81      4172
   macro avg       0.71      0.75      0.72      4172
weighted avg       0.83      0.81      0.81      4172


optimal_threshold1 = .23

y_pred_test_LgOT = lg.predict_proba(x_test)

metrics_score(y_test, y_pred_test_LgOT[:, 1] > optimal_threshold1)

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1431
           1       0.52      0.60      0.56       357

    accuracy                           0.81      1788
   macro avg       0.71      0.73      0.72      1788
weighted avg       0.82      0.81      0.81      1788


# Define the parameter grid to search over
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'class_weight': ['balanced', None]
}

# Create a GridSearchCV object with the logistic regression model, parameter grid, and cross-validation
grid_search = GridSearchCV(lg, param_grid, cv=10)

# Fit the GridSearchCV object on the resampled training data
grid_search.fit(x_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best parameters: ", grid_search.best_params_)

Best parameters:  {'C': 0.001, 'class_weight': None, 'penalty': 'none', 'solver': 'newton-cg'}


# Create a logistic regression model with the best hyperparameters
logreg = LogisticRegression(C=0.1, class_weight='balanced', penalty='l2', solver='saga')

# Train the model on the resampled training data
logreg.fit(x_train, y_train)

LogisticRegression(C=0.1, class_weight='balanced', solver='saga')

LogisticRegression(C=0.1, class_weight='balanced', solver='saga')


optimal_threshold = .21
y_pred_train_LgGS = (y_train > optimal_threshold).astype(int)
metrics_score(y_train, y_pred_train_LgGS)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


# Classify the test set based on the optimal threshold obtained earlier
y_pred_test_LgGS = (y_test > optimal_threshold).astype(int)
metrics_score(y_test, y_pred_test_LgGS)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1431
           1       1.00      1.00      1.00       357

    accuracy                           1.00      1788
   macro avg       1.00      1.00      1.00      1788
weighted avg       1.00      1.00      1.00      1788


dataI = dataM.copy()


plt.figure(figsize = (12, 7))

sns.heatmap(dataI.corr(), annot = True, fmt = '.2f')

plt.show()


# Separating the target variable and other variables

Y = dataI.BAD

X = dataI.drop(['BAD'], axis = 1)

# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1, stratify = Y)


# Fitting the decision tree classifier on the training data
d_tree = DecisionTreeClassifier(class_weight = {0: 0.2, 1: 0.8}, random_state = 42)

d_tree.fit(x_train, y_train)
# Checking performance on the training data
y_pred_train_DT = d_tree.predict(x_train)
metrics_score(y_train, y_pred_train_DT)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


# Checking performance on the testing data
y_pred_test_DT = d_tree.predict(x_test)
metrics_score(y_test, y_pred_test_DT)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1431
           1       0.89      0.84      0.87       357

    accuracy                           0.95      1788
   macro avg       0.93      0.91      0.92      1788
weighted avg       0.95      0.95      0.95      1788


# Model Performance on the test data
d_tree_test = model_performance_classification(d_tree,x_test,y_test)

d_tree_test


# Plotting the feature importance
importances = d_tree.feature_importances_

indices = np.argsort(importances)

plt.figure(figsize = (10, 10))

plt.title('Feature Importances')

plt.barh(range(len(indices)), importances[indices], color = 'violet', align = 'center')

plt.yticks(range(len(indices)), [features[i] for i in indices])

plt.xlabel('Relative Importance')

plt.show()


# Choose the type of classifier 
d_tree_tuned = DecisionTreeClassifier(random_state = 42, class_weight = {0: 0.2, 1: 0.8})

# Grid of parameters to choose from
parameters = {'max_depth': np.arange(2, 10), 
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': [5, 10, 20, 25]
             }

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
grid_obj = GridSearchCV(d_tree_tuned, parameters, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(x_train, y_train)

# Set the classifier to the best combination of parameters
d_tree_tuned = grid_obj.best_estimator_

# Fit the best algorithm to the data
d_tree_tuned.fit(x_train, y_train)

DecisionTreeClassifier(class_weight={0: 0.2, 1: 0.8}, max_depth=9,
                       min_samples_leaf=10, random_state=42)

DecisionTreeClassifier(class_weight={0: 0.2, 1: 0.8}, max_depth=9,
                       min_samples_leaf=10, random_state=42)


# Checking performance on the training data
y_pred_train_DTGS = d_tree_tuned.predict(x_train)

metrics_score(y_train, y_pred_train_DTGS)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97      3340
           1       0.82      0.93      0.87       832

    accuracy                           0.95      4172
   macro avg       0.90      0.94      0.92      4172
weighted avg       0.95      0.95      0.95      4172


# Checking performance on the testing data
y_pred_test_DTGS = d_tree_tuned.predict(x_test)

metrics_score(y_test, y_pred_test_DTGS)

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      1431
           1       0.74      0.84      0.79       357

    accuracy                           0.91      1788
   macro avg       0.85      0.88      0.87      1788
weighted avg       0.92      0.91      0.91      1788


# Model Performance on the test data
d_tree_tuned_test = model_performance_classification(d_tree_tuned,x_test,y_test)

d_tree_tuned_test


features = list(X.columns)

plt.figure(figsize = (20, 20))

tree.plot_tree(d_tree_tuned, feature_names = features, filled = True, fontsize = 9, node_ids = True, class_names = True)

plt.show()


# Importance of features in the tree building

print (pd.DataFrame(d_tree_tuned.feature_importances_, columns = ["Imp"], index = x_train.columns).sort_values(by = 'Imp', ascending = False))

                     Imp
DEBTINC         0.746040
CLAGE           0.065819
DELINQ          0.034500
MORTDUE         0.034019
VALUE           0.031408
LOAN            0.029151
CLNO            0.019760
YOJ             0.012465
NINQ            0.011319
DEROG           0.010678
JOB_ProfExe     0.002934
JOB_Sales       0.001612
JOB_Other       0.000294
REASON_HomeImp  0.000000
JOB_Office      0.000000
JOB_Self        0.000000


# Plotting the feature importance
importances = d_tree_tuned.feature_importances_

indices = np.argsort(importances)

plt.figure(figsize = (10, 10))

plt.title('Feature Importances')

plt.barh(range(len(indices)), importances[indices], color = 'violet', align = 'center')

plt.yticks(range(len(indices)), [features[i] for i in indices])

plt.xlabel('Relative Importance')

plt.show()


# Fitting the random forest tree classifier on the training data
rf_estimator = RandomForestClassifier(class_weight = {0: 0.20, 1: 0.80}, random_state = 42)

rf_estimator.fit(x_train, y_train)

RandomForestClassifier(class_weight={0: 0.2, 1: 0.8}, random_state=42)

RandomForestClassifier(class_weight={0: 0.2, 1: 0.8}, random_state=42)


# Checking performance on the training data
y_pred_train_RF = rf_estimator.predict(x_train)

metrics_score(y_train,y_pred_train_RF)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


# Checking performance on the testing data
y_pred_test_RF = rf_estimator.predict(x_test)

metrics_score(y_test,y_pred_test_RF)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1431
           1       0.99      0.80      0.88       357

    accuracy                           0.96      1788
   macro avg       0.97      0.90      0.93      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
rf_estimator_test = model_performance_classification(rf_estimator,x_test,y_test)

rf_estimator_test


# Choose the type of classifier
rf_estimator_tuned = RandomForestClassifier(criterion = "entropy", random_state = 42)

# Grid of parameters to choose from
parameters = {"n_estimators": [100, 110, 120],
    "max_depth": [5, 6, 7],
    "max_features": [0.8, 0.9, 1]
             }

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
grid_obj = GridSearchCV(rf_estimator_tuned, parameters, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(x_train, y_train)

# Set the classifier to the best combination of parameters
rf_estimator_tuned = grid_obj.best_estimator_


# Fitting the best algorithm to the training data
rf_estimator_tuned.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=7, max_features=0.8,
                       random_state=42)

RandomForestClassifier(criterion='entropy', max_depth=7, max_features=0.8,
                       random_state=42)


# Checking performance on the training data
y_pred_train_RFGS = rf_estimator_tuned.predict(x_train)

metrics_score(y_train, y_pred_train_RFGS)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      3340
           1       1.00      0.80      0.89       832

    accuracy                           0.96      4172
   macro avg       0.98      0.90      0.93      4172
weighted avg       0.96      0.96      0.96      4172


# Checking performance on the testing data
y_pred_test_RFGS = rf_estimator_tuned.predict(x_test)

metrics_score(y_test,y_pred_test_RFGS)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1431
           1       1.00      0.75      0.85       357

    accuracy                           0.95      1788
   macro avg       0.97      0.87      0.91      1788
weighted avg       0.95      0.95      0.95      1788


# Model Performance on the test data
rf_estimator_tuned_test = model_performance_classification(rf_estimator_tuned,x_test,y_test)

rf_estimator_tuned_test


# Choose the type of classifier 
rf_estimator_tuned = RandomForestClassifier(criterion = "entropy", random_state = 42)

# Grid of parameters to choose from
parameters = {"n_estimators": [110, 120],
    "max_depth": [6, 7],
    "min_samples_leaf": [20, 25],
    "max_features": [0.8, 0.9],
    "max_samples": [0.9, 1],
    "class_weight": ["balanced",{0: 0.3, 1: 0.7}]
             }

# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search on the training data using scorer=scorer and cv=5
grid_obj = GridSearchCV(rf_estimator_tuned, parameters, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(x_train, y_train)

# Save the best estimator to variable rf_estimator_tuned
rf_estimator_tuned_2 = grid_obj.fit(x_train, y_train)

#Fit the best estimator to the training data'
rf_estimator_tuned_2 = grid_obj.best_estimator_


# Checking performance on the training data
y_pred_train_RFGS_2 = rf_estimator_tuned_2.predict(x_train)

metrics_score(y_train, y_pred_train_RFGS_2)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      3340
           1       0.94      0.82      0.88       832

    accuracy                           0.95      4172
   macro avg       0.95      0.90      0.92      4172
weighted avg       0.95      0.95      0.95      4172


# Checking performance on the test data
y_pred_test_RFGS_2 = rf_estimator_tuned_2.predict(x_test)

metrics_score(y_test, y_pred_test_RFGS_2)

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1431
           1       0.90      0.74      0.81       357

    accuracy                           0.93      1788
   macro avg       0.92      0.86      0.89      1788
weighted avg       0.93      0.93      0.93      1788


# Model Performance on the test data
rf_estimator_tuned_2_test = model_performance_classification(rf_estimator_tuned_2,x_test,y_test)

rf_estimator_tuned_2_test


importances = rf_estimator_tuned_2.feature_importances_

indices = np.argsort(importances)

feature_names = list(X.columns)

plt.figure(figsize = (12, 12))

plt.title('Feature Importances')

plt.barh(range(len(indices)), importances[indices], color = 'violet', align = 'center')

plt.yticks(range(len(indices)), [feature_names[i] for i in indices])

plt.xlabel('Relative Importance')

plt.show()


# Create CatBoost classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Fit the model
catboost.fit(x_train, y_train)

# Make predictions on train set
y_pred_train_CB = catboost.predict(x_train)

# Compute metrics
metrics_score(y_train, y_pred_train_CB)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3340
           1       1.00      0.92      0.96       832

    accuracy                           0.98      4172
   macro avg       0.99      0.96      0.97      4172
weighted avg       0.98      0.98      0.98      4172


# Make predictions on train set
y_pred_test_CB = catboost.predict(x_test)

# Compute metrics
metrics_score(y_test, y_pred_test_CB)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1431
           1       0.99      0.81      0.89       357

    accuracy                           0.96      1788
   macro avg       0.97      0.90      0.93      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
catboost_test = model_performance_classification(catboost,x_test,y_test)

catboost_test


# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = make_scorer(recall_score, pos_label=1)


kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Grid of parameters to choose from
params_catboost = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "iterations": [50, 100, 200, 300, 500],
    "depth": [2, 3, 5, 7, 9],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "border_count": [32, 64, 128, 254],
    "bagging_temperature": [0, 0.1, 0.2, 0.3],
}

# Run the randomized search
random_obj = RandomizedSearchCV(catboost, params_catboost, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)
random_obj = random_obj.fit(x_train, y_train)

# Set the classifier to the best combination of parameters
catboost_tuned = random_obj.best_estimator_

# Fit the model
catboost_tuned.fit(x_train, y_train)

y_pred_train_CBRS = catboost_tuned.predict(x_train)
metrics_score(y_train, y_pred_train_CBRS)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


y_pred_test_CBRS = catboost_tuned.predict(x_test)
# Model Performance on the test data

metrics_score(y_test, y_pred_test_CBRS)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.82      0.90       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
catboost_tuned_test = model_performance_classification(catboost_tuned,x_test,y_test)

catboost_tuned_test


import lightgbm as lgb

# LightGBM Classifier
lgbm = lgb.LGBMClassifier(random_state=42)

# Fit the model
lgbm.fit(x_train, y_train)

# Make predictions on train set
y_pred_train_lgbm = lgbm.predict(x_train)

# Compute metrics
metrics_score(y_train, y_pred_train_lgbm)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


y_pred_test_lgbm = lgbm.predict(x_test)

# Compute metrics
metrics_score(y_test, y_pred_test_lgbm)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       1.00      0.82      0.90       357

    accuracy                           0.96      1788
   macro avg       0.98      0.91      0.94      1788
weighted avg       0.97      0.96      0.96      1788


# Model Performance on the test data
lgbm_test = model_performance_classification(lgbm,x_test,y_test)

lgbm_test


# Separating the target variable and other variables

YL = dataLGB.BAD

XL = dataLGB.drop(['BAD'], axis = 1)

# Splitting the data
xl_train, xl_test, yl_train, yl_test = train_test_split(XL, YL, test_size = 0.3, random_state = 1, stratify = Y)


# LightGBM Classifier
lgbmCat = lgb.LGBMClassifier(random_state=42)

# Fit the model
lgbmCat.fit(xl_train, yl_train)

# Make predictions on train set
yl_pred_train_lgbm = lgbmCat.predict(xl_train)

# Compute metrics
metrics_score(yl_train, yl_pred_train_lgbm)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


yl_pred_test_lgbm = lgbmCat.predict(xl_test)

# Compute metrics
metrics_score(yl_test, yl_pred_test_lgbm)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.82      0.90       357

    accuracy                           0.96      1788
   macro avg       0.98      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
lgbm_test_cat = model_performance_classification(lgbmCat,xl_test,yl_test)

lgbm_test_cat


# LightGBM Classifier
lgbm = lgb.LGBMClassifier(random_state=42)

# Grid of parameters to choose from
params_lgbm = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [2, 3, 5, 7, 9],
    "num_leaves": [15, 31, 63, 127],
    "min_child_samples": [10, 20, 30, 50],
    "subsample": [0.5, 0.6, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.8, 1.0],
}

# Run the randomized search
random_obj = RandomizedSearchCV(lgbm, params_lgbm, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)

random_obj = random_obj.fit(x_train, y_train)

# Set the classifier to the best combination of parameters
lgbm_tuned = random_obj.best_estimator_

# Fit the model
lgbm_tuned.fit(x_train, y_train)

y_pred_train_LGBMRS = lgbm_tuned.predict(x_train)
metrics_score(y_train, y_pred_train_LGBMRS)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


y_pred_test_LGBMRS = lgbm_tuned.predict(x_test)
# Model Performance on the test data

metrics_score(y_test, y_pred_test_LGBMRS)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.82      0.89       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
lgbm_tuned_test = model_performance_classification(lgbm_tuned,x_test,y_test)

lgbm_tuned_test


# Importing the XGBClassifier from the xgboost library
from xgboost import XGBClassifier

# XGBoost Classifier
xgb = XGBClassifier(random_state=42, eval_metric='logloss')


# Fit the model
xgb.fit(x_train, y_train)

y_pred_train_XGB = xgb.predict(x_train)
metrics_score(y_train, y_pred_train_XGB)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172


y_pred_test_XGB = xgb.predict(x_test)
# Model Performance on the test data
metrics_score(y_test, y_pred_test_XGB)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.81      0.89       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.93      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
xgb_test = model_performance_classification(xgb,x_test,y_test)

xgb_test


# undersampling
under_sampler = RandomUnderSampler(random_state=42)
x_train_under, y_train_under = under_sampler.fit_resample(x_train, y_train)

# LightGBM Classifier
lgbm = lgb.LGBMClassifier(random_state=42)

# Grid of parameters to choose from
params_lgbm = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [2, 3, 5, 7, 9],
    "num_leaves": [15, 31, 63, 127],
    "min_child_samples": [10, 20, 30, 50],
    "subsample": [0.5, 0.6, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.8, 1.0],
}

# Run the randomized search
random_obj = RandomizedSearchCV(lgbm, params_lgbm, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)

random_obj = random_obj.fit(x_train_under, y_train_under)

# Set the classifier to the best combination of parameters
lgbm_tuned_under = random_obj.best_estimator_

# Fit the model
lgbm_tuned_under.fit(x_train_under, y_train_under)

y_pred_train_LGBMUnder = lgbm_tuned_under.predict(x_train_under)
metrics_score(y_train_under, y_pred_train_LGBMUnder)


# Model Performance on the test data
#lgbm_perf_test = model_performance_classification(lgbm_tuned, x_test, y_test)

#print(lgbm_perf_test)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       832
           1       1.00      0.98      0.99       832

    accuracy                           0.99      1664
   macro avg       0.99      0.99      0.99      1664
weighted avg       0.99      0.99      0.99      1664


y_pred_test_LGBMUnder = lgbm_tuned_under.predict(x_test)
metrics_score(y_test, y_pred_test_LGBMUnder)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95      1431
           1       0.80      0.86      0.83       357

    accuracy                           0.93      1788
   macro avg       0.88      0.90      0.89      1788
weighted avg       0.93      0.93      0.93      1788


# Model Performance on the test data
lgbm_tuned_under_test = model_performance_classification(lgbm_tuned_under,x_test,y_test)

lgbm_tuned_under_test


kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Grid of parameters to choose from
params_catboost = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "iterations": [50, 100, 200, 300, 500],
    "depth": [2, 3, 5, 7, 9],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "border_count": [32, 64, 128, 254],
    "bagging_temperature": [0, 0.1, 0.2, 0.3],
}

# Run the randomized search
random_obj = RandomizedSearchCV(catboost, params_catboost, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)
random_obj = random_obj.fit(x_train_under, y_train_under)

# Set the classifier to the best combination of parameters
catboost_tuned_under = random_obj.best_estimator_

# Fit the model
catboost_tuned_under.fit(x_train_under, y_train_under)

y_pred_train_CBUnder = catboost_tuned_under.predict(x_train_under)
metrics_score(y_train_under, y_pred_train_CBUnder)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       832
           1       1.00      1.00      1.00       832

    accuracy                           1.00      1664
   macro avg       1.00      1.00      1.00      1664
weighted avg       1.00      1.00      1.00      1664


y_pred_test_CBUnder = catboost_tuned_under.predict(x_test)
metrics_score(y_test, y_pred_test_CBUnder)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1431
           1       0.87      0.85      0.86       357

    accuracy                           0.95      1788
   macro avg       0.92      0.91      0.91      1788
weighted avg       0.94      0.95      0.94      1788


# Model Performance on the test data
catboost_tuned_under_test = model_performance_classification(catboost_tuned_under,x_test,y_test)

catboost_tuned_under_test


# Oversampling
over_sampler = RandomOverSampler(sampling_strategy=0.8, random_state=42)
x_train_over, y_train_over = over_sampler.fit_resample(x_train, y_train)


# Oversampling
over_sampler = RandomOverSampler(sampling_strategy=0.8, random_state=42)
xl_train_over, yl_train_over = over_sampler.fit_resample(xl_train, yl_train)


kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Grid of parameters to choose from
params_catboost = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "iterations": [50, 100, 200, 300, 500],
    "depth": [2, 3, 5, 7, 9],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "border_count": [32, 64, 128, 254],
    "bagging_temperature": [0, 0.1, 0.2, 0.3],
}

# Run the randomized search
random_obj = RandomizedSearchCV(catboost, params_catboost, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)
random_obj = random_obj.fit(x_train_over, y_train_over)

# Set the classifier to the best combination of parameters
catboost_tuned_over = random_obj.best_estimator_

# Fit the model
catboost_tuned_over.fit(x_train_over, y_train_over)



y_pred_train_CBOver = catboost_tuned_over.predict(x_train_over)
metrics_score(y_train_over, y_pred_train_CBOver)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00      2672

    accuracy                           1.00      6012
   macro avg       1.00      1.00      1.00      6012
weighted avg       1.00      1.00      1.00      6012


y_pred_test_CBOver = catboost_tuned_over.predict(x_test)
metrics_score(y_test, y_pred_test_CBOver)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.98      0.83      0.90       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
catboost_tuned_over_test = model_performance_classification(catboost_tuned_over,x_test,y_test)

catboost_tuned_over_test


kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)

# Grid of parameters to choose from
params_catboost = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "iterations": [50, 100, 200, 300, 500],
    "depth": [2, 3, 5, 7, 9],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "border_count": [32, 64, 128, 254],
    "bagging_temperature": [0, 0.1, 0.2, 0.3],
}

# Run the randomized search
random_obj = RandomizedSearchCV(catboost, params_catboost, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)
random_objCat = random_obj.fit(xl_train_over, yl_train_over)

# Set the classifier to the best combination of parameters
catboost_tuned_overCat = random_objCat.best_estimator_

# Fit the model
catboost_tuned_overCat.fit(xl_train_over, yl_train_over)

yl_pred_train_CBOverCat = catboost_tuned_overCat.predict(xl_train_over)
metrics_score(yl_train_over, yl_pred_train_CBOverCat)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00      2672

    accuracy                           1.00      6012
   macro avg       1.00      1.00      1.00      6012
weighted avg       1.00      1.00      1.00      6012


yl_pred_test_CBOverCat = catboost_tuned_overCat.predict(xl_test)
metrics_score(yl_test, yl_pred_test_CBOverCat)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.83      0.91       357

    accuracy                           0.97      1788
   macro avg       0.98      0.92      0.94      1788
weighted avg       0.97      0.97      0.96      1788


# Model Performance on the test data
catboost_tuned_over_testCat = model_performance_classification(catboost_tuned_overCat,xl_test,yl_test)

catboost_tuned_over_testCat


# LightGBM Classifier
lgbm = lgb.LGBMClassifier(random_state=42)

# Grid of parameters to choose from
params_lgbm = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "n_estimators": [50, 100, 200, 300, 500],
    "max_depth": [2, 3, 5, 7, 9],
    "num_leaves": [15, 31, 63, 127],
    "min_child_samples": [10, 20, 30, 50],
    "subsample": [0.5, 0.6, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.8, 1.0],
}

# Run the randomized search
random_obj = RandomizedSearchCV(lgbm, params_lgbm, scoring='recall', cv=kfold, n_jobs=-1, n_iter=100, random_state=42)

random_obj = random_obj.fit(x_train_over, y_train_over)

# Set the classifier to the best combination of parameters
lgbm_tuned_over = random_obj.best_estimator_

# Fit the model
lgbm_tuned_over.fit(x_train_over, y_train_over)

y_pred_train_LGBMOver = lgbm_tuned_over.predict(x_train_over)
metrics_score(y_train_over, y_pred_train_LGBMOver)


# Model Performance on the test data
#lgbm_perf_test = model_performance_classification(lgbm_tuned, x_test, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00      2672

    accuracy                           1.00      6012
   macro avg       1.00      1.00      1.00      6012
weighted avg       1.00      1.00      1.00      6012


y_pred_test_LGBMOver = lgbm_tuned_over.predict(x_test)
metrics_score(y_test, y_pred_test_LGBMOver)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.98      0.82      0.89       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
lgbm_tuned_over_test = model_performance_classification(lgbm_tuned_over,x_test,y_test)

lgbm_tuned_over_test


models_test_comp_df = pd.concat(
    
    [    
   d_tree_test.T, d_tree_tuned_test.T, rf_estimator_test.T, 
    rf_estimator_tuned_test.T, rf_estimator_tuned_2_test.T,
    catboost_test.T, catboost_tuned_test.T, lgbm_test.T, 
    lgbm_tuned_test.T, xgb_test.T, lgbm_tuned_under_test.T, 
    catboost_tuned_under_test.T, catboost_tuned_over_test.T, 
    lgbm_tuned_over_test.T, lgbm_test_cat.T, catboost_tuned_over_testCat.T
    ],
    
    axis = 1,
)

models_test_comp_df.columns = [
    "Decision Tree classifier",
    "Tuned Decision Tree classifier",
    "Random Forest classifier",
    "Tuned Random Forest classifier",
    "Tuned Random Forest classifier 2",
    "Catboost classifier",
    "Tuned Catboost classifier",
    "lgbm classifier",
    "Tuned lgbm classifier",
    "XGBoost classifier",
    "Tuned lgbm classifier undersampling",
    "Tuned Catboost classifier undersampling",
    "Tuned Catboost classifier oversampling",
    "Tuned lgbm classifier oversampling", "Lgbm classifier without one-hot encoding",
    "Tuned Catboost classifier oversampling without one-hot encoding"
  
]

print("Test performance comparison:")
print(models_test_comp_df)

Test performance comparison:
           Decision Tree classifier  Tuned Decision Tree classifier  \
Precision                  0.926800                        0.850695   
Recall                     0.907589                        0.883830   
Accuracy                   0.947987                        0.909955   

           Random Forest classifier  Tuned Random Forest classifier  \
Precision                  0.968722                        0.968213   
Recall                     0.896361                        0.872200   
Accuracy                   0.956935                        0.948546   

           Tuned Random Forest classifier 2  Catboost classifier  \
Precision                          0.919890             0.973851   
Recall                             0.861016             0.904063   
Accuracy                           0.932327             0.960850   

           Tuned Catboost classifier  lgbm classifier  Tuned lgbm classifier  \
Precision                   0.974845         0.976880               0.971460   
Recall                      0.908265         0.910015               0.907566   
Accuracy                    0.962528         0.963647               0.961409   

           XGBoost classifier  Tuned lgbm classifier undersampling  \
Precision            0.972473                             0.879837   
Recall               0.905114                             0.902369   
Accuracy             0.960850                             0.927852   

           Tuned Catboost classifier undersampling  \
Precision                                 0.917862   
Recall                                    0.908996   
Accuracy                                  0.945190   

           Tuned Catboost classifier oversampling  \
Precision                                0.971183   
Recall                                   0.912819   
Accuracy                                 0.963087   

           Tuned lgbm classifier oversampling  \
Precision                            0.968831   
Recall                               0.909668   
Accuracy                             0.961409   

           Lgbm classifier without one-hot encoding  \
Precision                                  0.975177   
Recall                                     0.909665   
Accuracy                                   0.963087   

           Tuned Catboost classifier oversampling without one-hot encoding  
Precision                                           0.975178                
Recall                                              0.916319                
Accuracy                                            0.965324


# Compute the F1-score for each model
models_recall = models_test_comp_df.loc["Recall"].sort_values(ascending=False)
models_precision = models_test_comp_df.loc["Precision"].sort_values(ascending=False)
models_accuracy = models_test_comp_df.loc["Accuracy"].sort_values(ascending=False)

# Create a dataframe for the model rankings
model_ranking = pd.DataFrame({
    "Model": models_recall.index, 
    "Recall Rank": range(1, len(models_recall)+1),
    "Precision Rank": [models_precision.index.get_loc(m)+1 for m in models_recall.index],
    "Accuracy Rank": [models_accuracy.index.get_loc(m)+1 for m in models_recall.index]
})

# Compute the total rank for each model
model_ranking["Total Rank"] = model_ranking.sum(axis=1)

# Sort by the total rank
model_ranking = model_ranking.sort_values("Total Rank")

# Set the index of the dataframe
model_ranking.set_index("Model", inplace=True)

# Display the model rankings
print("Model Rankings:")
print(model_ranking)

Model Rankings:
                                                    Recall Rank  \
Model                                                             
Tuned Catboost classifier oversampling without ...            1   
lgbm classifier                                               3   
Lgbm classifier without one-hot encoding                      5   
Tuned Catboost classifier oversampling                        2   
Tuned Catboost classifier                                     7   
Tuned lgbm classifier oversampling                            4   
Tuned lgbm classifier                                         9   
Catboost classifier                                          11   
XGBoost classifier                                           10   
Decision Tree classifier                                      8   
Tuned Catboost classifier undersampling                       6   
Random Forest classifier                                     13   
Tuned Random Forest classifier                               15   
Tuned lgbm classifier undersampling                          12   
Tuned Random Forest classifier 2                             16   
Tuned Decision Tree classifier                               14   

                                                    Precision Rank  \
Model                                                                
Tuned Catboost classifier oversampling without ...               2   
lgbm classifier                                                  1   
Lgbm classifier without one-hot encoding                         3   
Tuned Catboost classifier oversampling                           8   
Tuned Catboost classifier                                        4   
Tuned lgbm classifier oversampling                               9   
Tuned lgbm classifier                                            7   
Catboost classifier                                              5   
XGBoost classifier                                               6   
Decision Tree classifier                                        12   
Tuned Catboost classifier undersampling                         14   
Random Forest classifier                                        10   
Tuned Random Forest classifier                                  11   
Tuned lgbm classifier undersampling                             15   
Tuned Random Forest classifier 2                                13   
Tuned Decision Tree classifier                                  16   

                                                    Accuracy Rank  Total Rank  
Model                                                                          
Tuned Catboost classifier oversampling without ...              1           4  
lgbm classifier                                                 2           6  
Lgbm classifier without one-hot encoding                        4          12  
Tuned Catboost classifier oversampling                          3          13  
Tuned Catboost classifier                                       5          16  
Tuned lgbm classifier oversampling                              7          20  
Tuned lgbm classifier                                           6          22  
Catboost classifier                                             8          24  
XGBoost classifier                                              9          25  
Decision Tree classifier                                       12          32  
Tuned Catboost classifier undersampling                        13          33  
Random Forest classifier                                       10          33  
Tuned Random Forest classifier                                 11          37  
Tuned lgbm classifier undersampling                            15          42  
Tuned Random Forest classifier 2                               14          43  
Tuned Decision Tree classifier                                 16          46


from catboost import Pool
# Create DataFrame with feature importance ranking
feat_imp = pd.DataFrame({
    'feature': X.columns,
    'importance': catboost_tuned_over.feature_importances_
})

feat_imp = feat_imp.sort_values(by='importance', ascending=False)

# Plot feature importance ranking
plt.figure(figsize=(12,8))
sns.barplot(x='importance', y='feature', data=feat_imp)
plt.title('Tuned Catboost classifier with oversampling Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


# Select top 5 features based on importance
top_features = feat_imp.head(10)['feature'].values
X_top = X[top_features]

# Train a model with only the top features
catboost_top = CatBoostClassifier(random_state=42, verbose=0)
catboost_top.fit(x_train_over[top_features], y_train_over)

<catboost.core.CatBoostClassifier at 0x172db299490>


y_pred_train_top = catboost_top.predict(x_train_over)
metrics_score(y_train_over, y_pred_train_top)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3340
           1       1.00      0.99      0.99      2672

    accuracy                           1.00      6012
   macro avg       1.00      0.99      1.00      6012
weighted avg       1.00      1.00      1.00      6012


y_pred_test_CBTOP = catboost_top.predict(x_test)
metrics_score(y_test, y_pred_test_CBTOP)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.82      0.90       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Model Performance on the test data
CBtop_test = model_performance_classification(catboost_top,x_test,y_test)
CBtop_test


from scipy.stats import chi2

# Confusion matrix for catboost_top
#y_pred_test_top = catboost_top.predict(x_test)
confusion_top = confusion_matrix(y_test, y_pred_test_top)

# Confusion matrix for catboost_tuned_over
#y_pred_test_over = catboost_tuned_over.predict(x_test)
confusion_over = confusion_matrix(y_test, y_pred_test_over)

# Calculate contingency table
a = np.sum(np.logical_and(y_pred_test_top == 1, y_pred_test_over == 1))
b = np.sum(np.logical_and(y_pred_test_top == 0, y_pred_test_over == 1))
c = np.sum(np.logical_and(y_pred_test_top == 1, y_pred_test_over == 0))
d = np.sum(np.logical_and(y_pred_test_top == 0, y_pred_test_over == 0))

# Calculate McNemar's test statistic
chi_squared = ((b - c) ** 2) / (b + c)

# Calculate p-value
p_value = 1 - chi2.cdf(chi_squared, 1)

print(f"McNemar's test statistic: {chi_squared:.4f}")
print(f"p-value: {p_value:.4f}")

McNemar's test statistic: 98.2704
p-value: 0.0000


# Get feature importance
feat_imp = pd.DataFrame({
    'feature': x_train.columns,
    'importance': lgbm.feature_importances_
}).sort_values(by='importance', ascending=False)

# Plot feature importance ranking
plt.figure(figsize=(12,8))
sns.barplot(x='importance', y='feature', data=feat_imp)
plt.title('LGB model Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


# Get feature importance
feat_imp = pd.DataFrame({
    'feature': xl_train.columns,
    'importance': lgbmCat.feature_importances_
}).sort_values(by='importance', ascending=False)

# Plot feature importance ranking
plt.figure(figsize=(12,8))
sns.barplot(x='importance', y='feature', data=feat_imp)
plt.title('LGB model without one-hot encoding Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


# Select top 5 features based on importance
top_features = feat_imp.head(13)['feature'].values
X_top = X[top_features]

# Train a model with only the top features
lgbm_top = LGBMClassifier(random_state=42, verbose=0)
lgbm_top.fit(x_train[top_features], y_train)

[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

LGBMClassifier(random_state=42, verbose=0)

LGBMClassifier(random_state=42, verbose=0)


# Select top 5 features based on importance
top_features = feat_imp.head(13)['feature'].values
X_top = X[top_features]

# Train a model with only the top features
lgbm_top = LGBMClassifier(random_state=42, verbose=0)
lgbm_top.fit(x_train[top_features], y_train)

[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

LGBMClassifier(random_state=42, verbose=0)

LGBMClassifier(random_state=42, verbose=0)


y_pred_test_lgbmTOP = lgbm_top.predict(x_test[top_features])
metrics_score(y_test, y_pred_test_lgbmTOP)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.82      0.90       357

    accuracy                           0.96      1788
   macro avg       0.97      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788


# Confusion matrix for catboost_tuned_over
confusion_CBOver = confusion_matrix(y_test, y_pred_test_CBOver)

# Confusion matrix for lgbm_tuned_over_test
confusion_LGBM = confusion_matrix(y_test, yl_pred_test_lgbm)

# Calculate contingency table
a = np.sum(np.logical_and(y_pred_test_CBOver == 1, y_pred_test_lgbm == 1))
b = np.sum(np.logical_and(y_pred_test_CBOver == 0, y_pred_test_lgbm == 1))
c = np.sum(np.logical_and(y_pred_test_CBOver == 1, y_pred_test_lgbm == 0))
d = np.sum(np.logical_and(y_pred_test_CBOver == 0, y_pred_test_lgbm == 0))

# Calculate McNemar's test statistic
chi_squared = ((b - c) ** 2) / (b + c)

# Calculate p-value
p_value = 1 - chi2.cdf(chi_squared, 1)

print(f"McNemar's test statistic: {chi_squared:.8f}, p-value: {p_value:.8f}")

McNemar's test statistic: 2.57894737, p-value: 0.10829366


from sklearn.metrics import roc_auc_score, roc_curve

# Calculate AUC-ROC for catboost_tuned_over
y_pred_proba_CBOver = catboost_tuned_over.predict_proba(x_test)[:, 1]
aucroc_CBOver = roc_auc_score(y_test, y_pred_proba_CBOver)
fpr_CBOver, tpr_CBOver, thresholds_CBOver = roc_curve(y_test, y_pred_proba_CBOver)

# Calculate AUC-ROC for lgbm_tuned_over
y_pred_proba_LGBM = lgbm.predict_proba(x_test)[:, 1]
aucroc_LGBM = roc_auc_score(y_test, y_pred_proba_LGBM)
fpr_LGBM, tpr_LGBM, thresholds_LGBM = roc_curve(y_test, y_pred_proba_LGBM)

# Plot ROC curves for the two models
plt.plot(fpr_CBOver, tpr_CBOver, label=f'Catboost Tuned Over (AUC-ROC = {aucroc_CBOver:.4f})')
plt.plot(fpr_LGBM, tpr_LGBM, label=f'LGBM (AUC-ROC = {aucroc_LGBM:.4f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend(loc='lower right')
plt.show()


from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Define the scoring metric as F1-score
scorer = make_scorer(f1_score)

# Perform 5-fold cross-validation on Catboost Tuned Over model
catboost_scores = cross_val_score(catboost_tuned_over, X, Y, cv=5, scoring=scorer)

# Perform 5-fold cross-validation on LGBM Tuned Over model
lgbm_scores = cross_val_score(lgbm, X, Y, cv=5, scoring=scorer)

# Compute the mean F1-score for each model
catboost_mean_f1 = catboost_scores.mean()
lgbm_mean_f1 = lgbm_scores.mean()

print(f"Catboost Tuned Over mean F1-score: {catboost_mean_f1:.4f}")
print(f"LGBM mean F1-score: {lgbm_mean_f1:.4f}")

Catboost Tuned Over mean F1-score: 0.6492
LGBM mean F1-score: 0.6315


from sklearn.model_selection import cross_val_score
from scipy.stats import ttest_rel

# define models to compare
models = []
models.append(('Catboost', catboost_tuned_over))
models.append(('LGBM', lgbm))

# evaluate each model in turn using cross-validation
results = []
names = []
for name, model in models:
    scores = cross_val_score(model, x_train_over, y_train_over, cv=5, scoring='f1_macro')
    results.append(scores)
    names.append(name)
    print(f"{name}: F1-score mean={scores.mean():.4f}, std={scores.std():.4f}")

# perform paired t-test to compare the means of F1-scores
stat, p = ttest_rel(results[0], results[1])
alpha = 0.05
print('Paired t-test:')
print(f"statistic={stat:.4f}, p-value={p:.4f}")
if p > alpha:
    print('The difference between the models is not statistically significant (fail to reject H0)')
else:
    print('The difference between the models is statistically significant (reject H0)')

Catboost: F1-score mean=0.9931, std=0.0058
LGBM: F1-score mean=0.9914, std=0.0047
Paired t-test:
statistic=2.3902, p-value=0.0752
The difference between the models is not statistically significant (fail to reject H0)


import time

start_time = time.time()

# Time to train the model
lgbm.fit(x_train, y_train)

end_time = time.time()

print("LGBM. Time to train the model:", end_time - start_time, "sec")

LGBM. Time to train the model: 0.388735294342041 sec


import time

start_time = time.time()

# Time to train the model
lgbmCat.fit(xl_train, yl_train)

end_time = time.time()

print("LGBM without one-hot encoding. Time to train the model:", end_time - start_time, "sec")

LGBM without one-hot encoding. Time to train the model: 0.3463006019592285 sec


import time

start_time = time.time()

# Time to train the model
catboost_tuned_over.fit(x_test, y_test)

end_time = time.time()

print("Catboost. Time to train the model:", end_time - start_time, "sec")

Catboost. Time to train the model: 1.3230578899383545 sec


import time

start_time = time.time()

# Time to train the model
catboost_tuned_overCat.fit(xl_train_over, yl_train_over)

end_time = time.time()

print("Catboost without one-hot encoding. Time to train the model:", end_time - start_time, "sec")

Catboost without one-hot encoding. Time to train the model: 1.0111026763916016 sec


## Separate the target variable and other variables
YL = dataLGB.BAD
XL = dataLGB.drop(['BAD'], axis=1)

# Split the data
xl_train, xl_test, yl_train, yl_test = train_test_split(XL, YL, test_size=0.3, random_state=1, stratify=YL)

# LightGBM Classifier
lgbmCat = lgb.LGBMClassifier(random_state=42)

# Fit the model
lgbmCat.fit(xl_train, yl_train)

# Make predictions on train set
yl_pred_train_lgbm = lgbmCat.predict(xl_train)

# Compute metrics
metrics_score(yl_train, yl_pred_train_lgbm)

yl_pred_test_lgbm = lgbmCat.predict(xl_test)

# Compute metrics
metrics_score(yl_test, yl_pred_test_lgbm)

# Predict probability of default
yl_pred_proba_lgbm = lgbmCat.predict_proba(xl_test)[:, 1]

# Set threshold for probability of default
threshold = 0.15

# Create a DataFrame with the test data and the predicted probability of default
result_df = pd.DataFrame({'yl_test': yl_test, 'yl_pred_proba_lgbm': yl_pred_proba_lgbm})

# Add a column to the DataFrame indicating whether the loan was approved or not based on the threshold
result_df['approved'] = np.where(result_df['yl_pred_proba_lgbm'] < threshold, 1, 0)

# convert probabilities to binary predictions using threshold
y_pred = (yl_pred_proba_lgbm >= threshold).astype(int)

# evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Compute metrics
metrics_score(y_test, y_pred)

# print the evaluation metrics
print('Accuracy:', accuracy)
print('Recall:', recall)
print('Precision:', precision)
print('F1-Score:', f1)

# Display the DataFrame
print(result_df.head(100))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3340
           1       1.00      1.00      1.00       832

    accuracy                           1.00      4172
   macro avg       1.00      1.00      1.00      4172
weighted avg       1.00      1.00      1.00      4172

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1431
           1       0.99      0.82      0.90       357

    accuracy                           0.96      1788
   macro avg       0.98      0.91      0.94      1788
weighted avg       0.96      0.96      0.96      1788

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1431
           1       0.96      0.87      0.91       357

    accuracy                           0.97      1788
   macro avg       0.96      0.93      0.95      1788
weighted avg       0.97      0.97      0.97      1788

Accuracy: 0.9670022371364653
Recall: 0.8711484593837535
Precision: 0.9598765432098766
F1-Score: 0.9133627019089574
      yl_test  yl_pred_proba_lgbm  approved
4394        0            0.002379         1
5000        0            0.018923         1
2786        0            0.003440         1
2256        0            0.005746         1
114         0            0.307778         0
...       ...                 ...       ...
3673        0            0.005667         1
4683        0            0.004449         1
665         0            0.070435         1
2928        0            0.007149         1
4012        0            0.004060         1

[100 rows x 3 columns]

	count	mean	std	min	25%	50%	75%	max
BAD	5960.0	0.199497	0.399656	0.000000	0.000000	0.000000	0.000000	1.000000
LOAN	5960.0	18607.969799	11207.480417	1100.000000	11100.000000	16300.000000	23300.000000	89900.000000
MORTDUE	5442.0	73760.817200	44457.609458	2063.000000	46276.000000	65019.000000	91488.000000	399550.000000
VALUE	5848.0	101776.048741	57385.775334	8000.000000	66075.500000	89235.500000	119824.250000	855909.000000
YOJ	5445.0	8.922268	7.573982	0.000000	3.000000	7.000000	13.000000	41.000000
DEROG	5252.0	0.254570	0.846047	0.000000	0.000000	0.000000	0.000000	10.000000
DELINQ	5380.0	0.449442	1.127266	0.000000	0.000000	0.000000	0.000000	15.000000
CLAGE	5652.0	179.766275	85.810092	0.000000	115.116702	173.466667	231.562278	1168.233561
NINQ	5450.0	1.186055	1.728675	0.000000	0.000000	1.000000	2.000000	17.000000
CLNO	5738.0	21.296096	10.138933	0.000000	15.000000	20.000000	26.000000	71.000000
DEBTINC	4693.0	33.779915	8.601746	0.524499	29.140031	34.818262	39.003141	203.312149

	BAD	LOAN	MORTDUE	VALUE	REASON	JOB	YOJ	DEROG	DELINQ	CLAGE	NINQ	CLNO	DEBTINC
0	1	1100	25860.0	39025.0	HomeImp	Other	10.5	0.0	0.0	94.366667	1.0	9.0	NaN
1	1	1300	70053.0	68400.0	HomeImp	Other	7.0	0.0	2.0	121.833333	0.0	14.0	NaN
2	1	1500	13500.0	16700.0	HomeImp	Other	4.0	0.0	0.0	149.466667	1.0	10.0	NaN
3	1	1500	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	0	1700	97800.0	112000.0	HomeImp	Office	3.0	0.0	0.0	93.333333	0.0	14.0	NaN

	LOAN	MORTDUE	VALUE	REASON	JOB	YOJ	CLAGE	CLNO	DEBTINC
5955	88900	57264.0	90185.0	DebtCon	Other	16.0	221.808718	16.0	36.112347
5956	89000	54576.0	92937.0	DebtCon	Other	16.0	208.692070	15.0	35.859971
5957	89200	54045.0	92924.0	DebtCon	Other	15.0	212.279697	15.0	35.556590
5958	89800	50370.0	91861.0	DebtCon	Other	14.0	213.892709	16.0	34.340882
5959	89900	48811.0	88934.0	DebtCon	Other	15.0	219.601002	16.0	34.571519

	% of Bad Loans
JOB
Sales	34.862385
Self	30.051813
Mgr	23.337679
Other	23.199330
ProfExe	16.614420
Office	13.185654

	BAD	LOAN	MORTDUE	VALUE	REASON	JOB	YOJ	DELINQ	CLAGE	NINQ	CLNO	DEBTINC
0	1	1100	25860.0000	39025.000000	1	2	10.5	0.0	94.366667	1.0	9.0	38.079762
1	1	1300	70053.0000	68400.000000	1	2	7.0	2.0	121.833333	0.0	14.0	38.079762
2	1	1500	13500.0000	16700.000000	1	2	4.0	0.0	149.466667	1.0	10.0	38.079762
3	1	1500	73760.8172	101776.048741	0	2	6.0	0.0	132.866667	1.0	20.0	38.079762
4	0	1700	97800.0000	112000.000000	1	4	3.0	0.0	93.333333	0.0	14.0	34.541671

	0
DELINQ	8.116356
DEBTINC	7.968688
DEROG	4.680553
NINQ	2.142448
VALUE	1.446299
JOB_Sales	0.617306
JOB_Self	0.399218
REASON_HomeImp	0.274034
JOB_Other	-0.061200
JOB_ProfExe	-0.179119
JOB_Office	-0.471483
CLNO	-0.553631
YOJ	-0.589721
MORTDUE	-0.814848
LOAN	-1.442644
CLAGE	-4.547162

	odds
DELINQ	3348.796080
DEBTINC	2889.063168
DEROG	107.829671
NINQ	8.520267
VALUE	4.247367
JOB_Sales	1.853927
JOB_Self	1.490658
REASON_HomeImp	1.315259
JOB_Other	0.940635
JOB_ProfExe	0.836006
JOB_Office	0.624076
CLNO	0.574859
YOJ	0.554482
MORTDUE	0.442707
LOAN	0.236302
CLAGE	0.010597

Loan Default Prediction¶

Problem Definition¶

The Context:¶

The objective:¶

The key questions:¶

The problem formulation:¶

Data Description:¶

Import the necessary libraries and Data¶

Data Overview¶

Summary Statistics¶

Observations from Summary Statistics¶

Exploratory Data Analysis (EDA) and Visualization¶

Univariate Analysis¶

1. What is the range of values for the loan amount variable "LOAN"?¶

2. How does the distribution of years at present job "YOJ" vary across the dataset?¶

3. How many unique categories are there in the JOB and the REASON variable?¶

4. What is the most common category in the JOB and the REASON variable?¶

Bivariate Analysis¶

5. Is there a relationship between the REASON variable and the proportion of applicants who defaulted on their loan?¶

The results of the chi-squared test indicates that there is a statistically significant relationship between the REASON and BAD variables.¶

6. Do applicants who default have a significantly different loan amount compared to those who repay their loan?¶

Based on the results of the t-test, applicants who default do have a significantly different loan amount compared to those who repay their loan.¶

7. Is there a correlation between the value of the property and the loan default rate?¶

8. Do applicants who default have a significantly different mortgage amount compared to those who repay their loan?¶

Based on the results of the t-test, there is evidence to suggest that applicants who default have a significantly different mortgage amount compared to those who repay their loan.¶

We can conclude that there is a significant association between the job type and the probability of a customer defaulting on a loan. This is also confirmed by the default rates for each profession. The highest default rates are among sales and self-employed occupations.¶

If a borrower has 6 or more delinquent credit lines, the probability of default approaches 100%¶

If a borrower has 7 or more major derogatory reports, the probability of default approaches 100%¶

Is there a relationship between the age of the oldest credit line in months and loan default?¶

Is there a relationship between debt-to-income ratio and loan default?¶

Multivariate Analysis¶

Important Insights from EDA¶

Treating Missing Values¶

Model Building - Approach¶

Create copy for LGBM Classifier¶

Converting categorical columns using the get_dummies function¶

Treatment of outliers¶

Logistic Regression¶

Decision Tree¶

Decision Tree - Hyperparameter Tuning¶

Building a Random Forest Classifier¶

Random Forest Classifier Hyperparameter Tuning¶

CatBoostClassifier, LightGBM Classifier, XGBClassifier¶

1. Comparison of various techniques and their relative performance based on chosen Metric (Measure of success):**¶

This table provides a comparison of the test performance of various classifiers on the chosen metrics: precision, recall, and accuracy.¶

Let's check whether it is possible to improve the tuned Catboost classifier with oversampling by selecting the most important features¶

To compare the performance of the two models more rigorously, we can perform a statistical test such as McNemar's test. This test compares the performance of two models on the same data and can determine if one model performs significantly better than the other¶

Let's check the most important features of the model "lgbm" by selecting¶

McNemar's test between "catboost_tuned_over" and "lgbm"¶

The ROC curve is a plot that shows the relationship between the true positive rate and false positive rate as the classification threshold is varied.¶

We can compare the performance of the models using cross-validation and a statistical test:¶

2. Refined insights:¶

3. Proposal for the final solution design:¶

Set a threshold for the model's predicted probability of default¶

Executive Summary:¶

Problem and Solution Summary:¶

Recommendations for Implementation:¶