import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
Checking shape, column titles and duplicate records:
There are 100,000 records, 28 columns and no duplicates.
df = pd.read_csv("train.csv")
print("\nRows and columns:\n",df.shape)
print("\nColumn titles:\n",df.columns)
print("\nDuplicates:\n", df.duplicated().value_counts())
Rows and columns: (100000, 28) Column titles: Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance', 'Credit_Score'], dtype='object') Duplicates: False 100000 Name: count, dtype: int64
To make data manipulation easier, let's lowercase the column names and drop a few columns we will not need:
df.columns = [x.lower() for x in df.columns]
df.drop(['id', 'customer_id', 'month', 'name','ssn', 'changed_credit_limit'], axis=1, inplace = True)
print(df.shape)
print(df.columns)
(100000, 22) Index(['age', 'occupation', 'annual_income', 'monthly_inhand_salary', 'num_bank_accounts', 'num_credit_card', 'interest_rate', 'num_of_loan', 'type_of_loan', 'delay_from_due_date', 'num_of_delayed_payment', 'num_credit_inquiries', 'credit_mix', 'outstanding_debt', 'credit_utilization_ratio', 'credit_history_age', 'payment_of_min_amount', 'total_emi_per_month', 'amount_invested_monthly', 'payment_behaviour', 'monthly_balance', 'credit_score'], dtype='object')
Let's check how clean the data is. First we count the null entries per column. The result shows quite a lot of missing data!
df.isnull().sum()
age 0 occupation 0 annual_income 0 monthly_inhand_salary 15002 num_bank_accounts 0 num_credit_card 0 interest_rate 0 num_of_loan 0 type_of_loan 11408 delay_from_due_date 0 num_of_delayed_payment 7002 num_credit_inquiries 1965 credit_mix 0 outstanding_debt 0 credit_utilization_ratio 0 credit_history_age 9030 payment_of_min_amount 0 total_emi_per_month 0 amount_invested_monthly 4479 payment_behaviour 0 monthly_balance 1200 credit_score 0 dtype: int64
We have at least seven columns that need attention. For our short tutorial here, let's do the following:
'monthly_inhand_salary': remove it, since we already have 'annual_income' to indicate the income;
'type_of_loan': remove this one in this tutorial, too, and assume that the probability of a payback default depends mostly on the economic situation and personal profile of the borrower, not on the type of loan;
'num_of_delayed_payment': this seems like an important factor; we will inspect the data and decide how to handle it;
'num_credit_inquiries': the amount of missing data is very small, so let's fill the gaps with the average value;
'credit_history_age': remove this one, too;
'amount_invested_monthly': this seems like an important factor; we will inspect the data and decide how to handle it;
'monthly_balance': this seems like an important factor; we will inspect the data and decide how to handle it.
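The plan above can be sketched as a small, reusable helper driven by a per-column strategy table. This is only an illustration; `clean_nulls` and the strategy names ('drop', 'zero', 'mean') are hypothetical, not part of the dataset or of pandas:

```python
import pandas as pd

def clean_nulls(df, strategy):
    """Apply a per-column null-handling strategy: 'drop' the column,
    fill nulls with 'zero', or fill nulls with the column 'mean'."""
    for col, how in strategy.items():
        if how == 'drop':
            df = df.drop(columns=[col])
        elif how == 'zero':
            df[col] = df[col].fillna(0)
        elif how == 'mean':
            df[col] = df[col].fillna(df[col].mean())
    return df

# the plan from the list above, expressed as data
strategy = {
    'monthly_inhand_salary': 'drop',
    'type_of_loan': 'drop',
    'credit_history_age': 'drop',
    'num_credit_inquiries': 'mean',
}
```

Keeping the plan in one dictionary makes it easy to revisit a decision later without hunting through the notebook.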
First, let's remove the columns we decided to drop:
df.drop(['monthly_inhand_salary', 'type_of_loan', 'credit_history_age'], axis=1, inplace = True)
We replace the null values with zeros, since most likely these customers simply had no delayed payments and so no value was entered.
As the next step, we use the .value_counts().sort_values() method, which usually exposes the anomalies and problems in the data. Here we see that some values are polluted with underscore characters and some counts are negative.
After stripping the underscores and setting the negative counts to zero, the values look good.
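As a toy illustration (synthetic values, not from our dataset) of why sorting the counts surfaces anomalies:

```python
import pandas as pd

# a toy column: mostly clean values, plus one string polluted with '_'
# and a couple of impossible negatives
s = pd.Series([3, 3, 3, 1, 1, '7_', -1, -1])
print(s.value_counts().sort_values())
# the rarest values, which are often the anomalies, surface at the top
```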
df['num_of_delayed_payment'] = df['num_of_delayed_payment'].replace(np.nan, 0)
print(df['num_of_delayed_payment'].value_counts().sort_values())
num_of_delayed_payment 2913 1 1473 1 4216 1 3404 1 2903 1 ... 10 5153 16 5173 17 5261 19 5327 0 7002 Name: count, Length: 750, dtype: int64
df['num_of_delayed_payment'] = [float(str(x).replace('_','')) for x in df['num_of_delayed_payment']]
df.loc[df['num_of_delayed_payment'] < 0, 'num_of_delayed_payment'] = 0
print(df['num_of_delayed_payment'].value_counts().sort_index(),"\n")
print(df['num_of_delayed_payment'].value_counts().sort_values())
num_of_delayed_payment 0.0 9255 1.0 1636 2.0 1810 3.0 1931 4.0 1838 ... 4344.0 1 4360.0 1 4384.0 1 4388.0 1 4397.0 1 Name: count, Length: 708, dtype: int64 num_of_delayed_payment 3243.0 1 2185.0 1 663.0 1 904.0 1 306.0 1 ... 10.0 5309 16.0 5312 17.0 5412 19.0 5481 0.0 9255 Name: count, Length: 708, dtype: int64
This column seems pretty clean except for the missing values, which we will fill with the mean of the column as a simple solution.
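Mean imputation itself is a one-liner with .fillna(); a minimal sketch on synthetic values:

```python
import numpy as np
import pandas as pd

s = pd.Series([2.0, 4.0, np.nan, 6.0])
filled = s.fillna(s.mean())  # mean of the non-null entries is 4.0
print(filled.tolist())       # [2.0, 4.0, 4.0, 6.0]
```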
print(df['num_credit_inquiries'].value_counts().sort_index(),"\n")
print(df['num_credit_inquiries'].value_counts().sort_values(),"\n\n")
print("Number of null values initially: ", df.isnull()['num_credit_inquiries'].sum(),"\n")
df['num_credit_inquiries'] = df['num_credit_inquiries'].replace(np.nan, df['num_credit_inquiries'].mean())
print("Null values after correction: ", df.isnull()['num_credit_inquiries'].sum())
num_credit_inquiries 0.0 6972 1.0 7588 2.0 8028 3.0 8890 4.0 11271 ... 2588.0 1 2589.0 2 2592.0 2 2594.0 1 2597.0 1 Name: count, Length: 1223, dtype: int64 num_credit_inquiries 2186.0 1 115.0 1 620.0 1 1232.0 1 2055.0 1 ... 2.0 8028 7.0 8058 6.0 8111 3.0 8890 4.0 11271 Name: count, Length: 1223, dtype: int64 Number of null values initially: 1965 Null values after correction: 0
Here we see that .value_counts().sort_index() raises an error about comparing str with int. This means some of the values are strings and not numbers. The outcome of .value_counts().sort_values() reveals the culprit: values polluted with underscores, such as '__10000__'. After removing the underscore characters, .sort_index() runs without errors since all the records are now numeric values.
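The same error and fix can be reproduced on a tiny synthetic column (the values here are made up for illustration):

```python
import pandas as pd

# toy column mixing real numbers with an underscore-polluted string
s = pd.Series([10, 10, '10000__', 5])
try:
    s.value_counts().sort_index()
except TypeError as err:
    print("sort_index fails on mixed types:", err)

# strip the underscores and convert, then sorting by index works again
cleaned = s.astype(str).str.replace('_', '', regex=False).astype(float)
print(cleaned.value_counts().sort_index())
```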
df['amount_invested_monthly'] = df['amount_invested_monthly'].replace(np.nan,0)
# print(df['amount_invested_monthly'].value_counts().sort_index())
print("+ Before correction:\n", df['amount_invested_monthly'].value_counts().sort_values(),"\n\n")
df['amount_invested_monthly'] = [float(str(x).replace('_','')) for x in df['amount_invested_monthly']]
print("+ After correction:\n", df['amount_invested_monthly'].value_counts().sort_index(),"\n")
print(df['amount_invested_monthly'].value_counts().sort_values())
+ Before correction: amount_invested_monthly 135.8228957996246 1 24.785216509052056 1 104.291825168246 1 40.39123782853101 1 263.17416316163934 1 ... 70.86997036607373 1 180.7330951944497 1 0.0 169 __10000__ 4305 0 4479 Name: count, Length: 91050, dtype: int64 + After correction: amount_invested_monthly 0.000000 4648 10.010194 1 10.011425 1 10.036600 1 10.053768 1 ... 1941.237454 1 1944.520747 1 1961.218850 1 1977.326102 1 10000.000000 4305 Name: count, Length: 91049, dtype: int64 amount_invested_monthly 82.555905 1 24.785217 1 104.291825 1 40.391238 1 263.174163 1 ... 125.956592 1 70.869970 1 180.733095 1 10000.000000 4305 0.000000 4648 Name: count, Length: 91049, dtype: int64
Same problem as the previous column: .value_counts().sort_index() raises an error about comparing str with int. The outcome of .value_counts().sort_values() reveals the culprit: '__-333333333333333333333333333__', repeated nine times!
After replacing those entries with zero, we see that the .sort_index() method runs without errors since all the records are now numeric values.
A remark: here I feel that something in the data does not make sense. If the monthly balance values peak at about 1602 USD, the monthly investment of 10000 USD that we saw in the previous column analysis was most likely an invalid value and must be set to zero. Let's do that as well.
df['monthly_balance'] = df['monthly_balance'].replace(np.nan,0)
# print(df['monthly_balance'].value_counts().sort_index())
print("+ Before correction:\n", df['monthly_balance'].value_counts().sort_values(),"\n\n")
df['monthly_balance'] = [float(str(x).replace('__-333333333333333333333333333__','0')) for x in df['monthly_balance']]
print("+ After correction:\n", df['monthly_balance'].value_counts().sort_index(),"\n")
print(df['monthly_balance'].value_counts().sort_values())
df['amount_invested_monthly'] = df['amount_invested_monthly'].replace(10000,0)
+ Before correction: monthly_balance 220.09076614959645 1 244.5653167062043 1 358.12416760938714 1 470.69062692529184 1 484.5912142650067 1 ... 394.6249135424154 1 310.8838062920152 1 189.64108011929028 1 __-333333333333333333333333333__ 9 0 1200 Name: count, Length: 98793, dtype: int64 + After correction: monthly_balance 0.000000 1209 0.007760 1 0.088628 1 0.095482 1 0.131136 1 ... 1564.134826 1 1566.613165 1 1567.208309 1 1576.288935 1 1602.040519 1 Name: count, Length: 98792, dtype: int64 monthly_balance 220.090766 1 340.479212 1 244.565317 1 358.124168 1 470.690627 1 ... 307.659291 1 381.935842 1 394.624914 1 348.088383 1 0.000000 1209 Name: count, Length: 98792, dtype: int64
Now let's check again how clean the data has become: the null values are gone. Additionally, the unwanted characters are removed, but perhaps not from all columns. Let's check which columns that are supposed to contain only numbers still have strings in them: 'age', 'annual_income', 'num_of_loan', and 'outstanding_debt'.
I will correct them one by one in the following.
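If we did not want to eyeball the columns, a small helper could flag them automatically. A sketch using pd.to_numeric with errors='coerce'; the function name `stray_string_columns` is my own, not from the notebook:

```python
import pandas as pd

def stray_string_columns(df):
    """Flag object columns that are only *partially* numeric, i.e. columns
    meant to hold numbers but polluted with a few stray strings."""
    suspects = []
    for col in df.select_dtypes(include='object'):
        coerced = pd.to_numeric(df[col], errors='coerce')
        # entries that became NaN through coercion, not ones that were null already
        n_bad = coerced.isna().sum() - df[col].isna().sum()
        if 0 < n_bad < len(df):  # some, but not all, entries fail to convert
            suspects.append(col)
    return suspects
```

Truly categorical columns like 'occupation' fail conversion everywhere and are skipped; only mixed columns are reported.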
print(df.isnull().sum(), "\n\n")
df.select_dtypes(include=('object'))
age 0 occupation 0 annual_income 0 num_bank_accounts 0 num_credit_card 0 interest_rate 0 num_of_loan 0 delay_from_due_date 0 num_of_delayed_payment 0 num_credit_inquiries 0 credit_mix 0 outstanding_debt 0 credit_utilization_ratio 0 payment_of_min_amount 0 total_emi_per_month 0 amount_invested_monthly 0 payment_behaviour 0 monthly_balance 0 credit_score 0 dtype: int64
| | occupation | credit_mix | payment_of_min_amount | payment_behaviour | credit_score |
|---|---|---|---|---|---|
| 0 | Scientist | Unknown | No | High_spent_Small_value_payments | Good |
| 1 | Scientist | Good | No | Low_spent_Large_value_payments | Good |
| 2 | Scientist | Good | No | Low_spent_Medium_value_payments | Good |
| 3 | Scientist | Good | No | Low_spent_Small_value_payments | Good |
| 4 | Scientist | Good | No | High_spent_Medium_value_payments | Good |
| ... | ... | ... | ... | ... | ... |
| 99995 | Mechanic | Unknown | No | High_spent_Large_value_payments | Poor |
| 99996 | Mechanic | Unknown | No | High_spent_Medium_value_payments | Poor |
| 99997 | Mechanic | Good | No | High_spent_Large_value_payments | Poor |
| 99998 | Mechanic | Good | No | Low_spent_Large_value_payments | Standard |
| 99999 | Mechanic | Good | No | Unknown | Poor |

100000 rows × 5 columns
# removing characters
df['age'] = [float(str(x).replace('_','')) for x in df['age']]
# replacing unacceptable values in the column with the average
mean_age = df['age'][df['age'] < 99].mean()
df.loc[df['age'] > 99, 'age'] = mean_age
df.loc[df['age'] < 0, 'age'] = mean_age
# removing characters
df['annual_income'] = [float(str(x).replace('_','')) for x in df['annual_income']]
df['num_of_loan'] = [float(str(x).replace('_','')) for x in df['num_of_loan']]
df['outstanding_debt'] = [float(str(x).replace('_','')) for x in df['outstanding_debt']]
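The three identical list comprehensions above can be folded into one helper. A sketch using the vectorized .str.replace, which does the same per-element replacement; `strip_underscores` is a hypothetical name:

```python
import pandas as pd

def strip_underscores(df, cols):
    """Strip stray '_' characters from the given columns and cast them to float."""
    for col in cols:
        df[col] = (df[col].astype(str)
                          .str.replace('_', '', regex=False)
                          .astype(float))
    return df
```

Usage on our columns would look like `df = strip_underscores(df, ['annual_income', 'num_of_loan', 'outstanding_debt'])`.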
Now that all numeric columns are cleaned to some level, we can quickly check the non-numeric ones. Some of them need corrections, which I will make in the following code blocks based on my judgement.
for col in df.select_dtypes(include='object'):
    print(df[col].value_counts(), "\n\n")
occupation Unemployed 7062 Lawyer 6575 Architect 6355 Engineer 6350 Scientist 6299 Mechanic 6291 Accountant 6271 Developer 6235 Media_Manager 6232 Teacher 6215 Entrepreneur 6174 Doctor 6087 Journalist 6085 Manager 5973 Musician 5911 Writer 5885 Name: count, dtype: int64 credit_mix Standard 36479 Good 24337 _ 20195 Bad 18989 Name: count, dtype: int64 payment_of_min_amount Yes 52326 No 35667 NM 12007 Name: count, dtype: int64 payment_behaviour Low_spent_Small_value_payments 25513 High_spent_Medium_value_payments 17540 Low_spent_Medium_value_payments 13861 High_spent_Large_value_payments 13721 High_spent_Small_value_payments 11340 Low_spent_Large_value_payments 10425 !@9#%8 7600 Name: count, dtype: int64 credit_score Standard 53174 Poor 28998 Good 17828 Name: count, dtype: int64
# assuming '_______' means no occupation was entered, hence unemployed
df['occupation'] = [x.replace('_______','Unemployed') for x in df['occupation']]
# just making it clearer
df['credit_mix'] = [x.replace('_','Unknown') for x in df['credit_mix']]
# assuming 'NM' was a typo for 'No'
df['payment_of_min_amount'] = [x.replace('NM','No') for x in df['payment_of_min_amount']]
# what can I say? !@9#%8
df['payment_behaviour'] = [x.replace('!@9#%8','Unknown') for x in df['payment_behaviour']]
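The four fixes above can also be expressed as a single DataFrame.replace call with a nested mapping; a sketch assuming the same placeholder values:

```python
import pandas as pd

fixes = {
    'occupation':            {'_______': 'Unemployed'},
    'credit_mix':            {'_': 'Unknown'},
    'payment_of_min_amount': {'NM': 'No'},
    'payment_behaviour':     {'!@9#%8': 'Unknown'},
}
# df = df.replace(fixes)  # applied to the full dataframe
```

Unlike the element-wise str.replace used above, DataFrame.replace with a dict matches whole cell values, so it cannot accidentally alter entries that merely contain an underscore.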
Now hopefully the data is ready to be used for some machine learning. I will save it under a new name.
The first 18 columns are the features and the last one, 'credit_score', is the target for our model.
df.to_csv('train_clean_data.csv', index=False)
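To actually feed the cleaned file into a model, the features and the target can be separated like this; `split_features_target` is a hypothetical helper, and note that the remaining text columns ('occupation', 'payment_behaviour', etc.) would still need encoding before most models can use them:

```python
import pandas as pd

def split_features_target(df, target='credit_score'):
    """Separate the feature columns from the target column."""
    X = df.drop(columns=[target])
    y = df[target]
    return X, y

# usage on the saved file from the step above:
# X, y = split_features_target(pd.read_csv('train_clean_data.csv'))
```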