Joining all processed data

This notebook joins all processed data and then saves it in a file for subsequent modeling.

# Last amended: 24th October, 2020
# Myfolder: C:\Users\Administrator\OneDrive\Documents\home_credit_default_risk
# Objective: 
#           Solving Kaggle problem: Home Credit Default Risk
#           Joining all processed datasets
#
# Data Source: https://www.kaggle.com/c/home-credit-default-risk/data
# Ref: https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features
# 1.0 Libraries
#     (Some of these may not be needed here.)
%reset -f
import numpy as np
import pandas as pd
import gc

# 1.1 Reduce read data size
#     There is a file reducing.py
#      in this folder. A class
#       in it is used to reduce
#        dataframe size
#     (Code modified by me to
#      exclude 'category' dtype)
import reducing

# 1.2 Misc
import warnings
import os
warnings.simplefilter(action='ignore', category=FutureWarning)
# 1.3
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# 1.4 Display multiple commands outputs from a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# 2.0 Prepare to read data
pathToData = "C:\\Users\\Administrator\\OneDrive\\Documents\\home_credit_default_risk"
os.chdir(pathToData)
# 2.1 Some constants
num_rows=None                # Implies read all rows
nan_as_category = True       # While transforming 
                             #   'object' columns to dummies
# 3.0 Read previous application data first
df = pd.read_csv(
                   'processed_df.csv.zip',
                   nrows = num_rows
                   )

# 3.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:

df = reducing.Reducer().reduce(df)
reduced df from 712.1107 MB to 257.1889 MB in 8.61 seconds
# 3.1
df.shape    # (356251, 262)
df.head(2)
(356251, 262)
Unnamed: 0 index SK_ID_CURR TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG ... ORGANIZATION_TYPE_Police ORGANIZATION_TYPE_Postal ORGANIZATION_TYPE_Realtor ORGANIZATION_TYPE_Religion ORGANIZATION_TYPE_Restaurant ORGANIZATION_TYPE_School ORGANIZATION_TYPE_Security ORGANIZATION_TYPE_Security Ministries ORGANIZATION_TYPE_Self-employed ORGANIZATION_TYPE_Services ORGANIZATION_TYPE_Telecom ORGANIZATION_TYPE_Trade: type 1 ORGANIZATION_TYPE_Trade: type 2 ORGANIZATION_TYPE_Trade: type 3 ORGANIZATION_TYPE_Trade: type 4 ORGANIZATION_TYPE_Trade: type 5 ORGANIZATION_TYPE_Trade: type 6 ORGANIZATION_TYPE_Trade: type 7 ORGANIZATION_TYPE_Transport: type 1 ORGANIZATION_TYPE_Transport: type 2 ORGANIZATION_TYPE_Transport: type 3 ORGANIZATION_TYPE_Transport: type 4 ORGANIZATION_TYPE_University ORGANIZATION_TYPE_XNA ORGANIZATION_TYPE_nan FONDKAPREMONT_MODE_not specified FONDKAPREMONT_MODE_org spec account FONDKAPREMONT_MODE_reg oper account FONDKAPREMONT_MODE_reg oper spec account FONDKAPREMONT_MODE_nan HOUSETYPE_MODE_block of flats HOUSETYPE_MODE_specific housing HOUSETYPE_MODE_terraced house HOUSETYPE_MODE_nan WALLSMATERIAL_MODE_Block WALLSMATERIAL_MODE_Mixed WALLSMATERIAL_MODE_Monolithic WALLSMATERIAL_MODE_Others WALLSMATERIAL_MODE_Panel WALLSMATERIAL_MODE_Stone, brick WALLSMATERIAL_MODE_Wooden WALLSMATERIAL_MODE_nan EMERGENCYSTATE_MODE_No EMERGENCYSTATE_MODE_Yes EMERGENCYSTATE_MODE_nan DAYS_EMPLOYED_PERC INCOME_CREDIT_PERC INCOME_PER_PERSON ANNUITY_INCOME_PERC PAYMENT_RATE
0 0 0 100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0.067329 0.498036 202500.0 0.121978 0.060749
1 1 1 100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 ... 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0.070862 0.208736 135000.0 0.132217 0.027598

2 rows × 262 columns

# 3.2
df.columns
df.drop(columns = ['Unnamed: 0', 'index'], inplace = True)
df.columns
Index(['Unnamed: 0', 'index', 'SK_ID_CURR', 'TARGET', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT',
       ...
       'WALLSMATERIAL_MODE_Wooden', 'WALLSMATERIAL_MODE_nan',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes',
       'EMERGENCYSTATE_MODE_nan', 'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC',
       'INCOME_PER_PERSON', 'ANNUITY_INCOME_PERC', 'PAYMENT_RATE'],
      dtype='object', length=262)
Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       ...
       'WALLSMATERIAL_MODE_Wooden', 'WALLSMATERIAL_MODE_nan',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes',
       'EMERGENCYSTATE_MODE_nan', 'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC',
       'INCOME_PER_PERSON', 'ANNUITY_INCOME_PERC', 'PAYMENT_RATE'],
      dtype='object', length=260)
# 3.3
df.head(2)
SK_ID_CURR TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE ... ORGANIZATION_TYPE_Police ORGANIZATION_TYPE_Postal ORGANIZATION_TYPE_Realtor ORGANIZATION_TYPE_Religion ORGANIZATION_TYPE_Restaurant ORGANIZATION_TYPE_School ORGANIZATION_TYPE_Security ORGANIZATION_TYPE_Security Ministries ORGANIZATION_TYPE_Self-employed ORGANIZATION_TYPE_Services ORGANIZATION_TYPE_Telecom ORGANIZATION_TYPE_Trade: type 1 ORGANIZATION_TYPE_Trade: type 2 ORGANIZATION_TYPE_Trade: type 3 ORGANIZATION_TYPE_Trade: type 4 ORGANIZATION_TYPE_Trade: type 5 ORGANIZATION_TYPE_Trade: type 6 ORGANIZATION_TYPE_Trade: type 7 ORGANIZATION_TYPE_Transport: type 1 ORGANIZATION_TYPE_Transport: type 2 ORGANIZATION_TYPE_Transport: type 3 ORGANIZATION_TYPE_Transport: type 4 ORGANIZATION_TYPE_University ORGANIZATION_TYPE_XNA ORGANIZATION_TYPE_nan FONDKAPREMONT_MODE_not specified FONDKAPREMONT_MODE_org spec account FONDKAPREMONT_MODE_reg oper account FONDKAPREMONT_MODE_reg oper spec account FONDKAPREMONT_MODE_nan HOUSETYPE_MODE_block of flats HOUSETYPE_MODE_specific housing HOUSETYPE_MODE_terraced house HOUSETYPE_MODE_nan WALLSMATERIAL_MODE_Block WALLSMATERIAL_MODE_Mixed WALLSMATERIAL_MODE_Monolithic WALLSMATERIAL_MODE_Others WALLSMATERIAL_MODE_Panel WALLSMATERIAL_MODE_Stone, brick WALLSMATERIAL_MODE_Wooden WALLSMATERIAL_MODE_nan EMERGENCYSTATE_MODE_No EMERGENCYSTATE_MODE_Yes EMERGENCYSTATE_MODE_nan DAYS_EMPLOYED_PERC INCOME_CREDIT_PERC INCOME_PER_PERSON ANNUITY_INCOME_PERC PAYMENT_RATE
0 100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0.067329 0.498036 202500.0 0.121978 0.060749
1 100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 ... 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0.070862 0.208736 135000.0 0.132217 0.027598

2 rows × 260 columns

# 3.4 Set SK_ID_CURR as Index
df = df.set_index('SK_ID_CURR')
df.head(2)
df.shape    # (356251, 259)
TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE ... ORGANIZATION_TYPE_Police ORGANIZATION_TYPE_Postal ORGANIZATION_TYPE_Realtor ORGANIZATION_TYPE_Religion ORGANIZATION_TYPE_Restaurant ORGANIZATION_TYPE_School ORGANIZATION_TYPE_Security ORGANIZATION_TYPE_Security Ministries ORGANIZATION_TYPE_Self-employed ORGANIZATION_TYPE_Services ORGANIZATION_TYPE_Telecom ORGANIZATION_TYPE_Trade: type 1 ORGANIZATION_TYPE_Trade: type 2 ORGANIZATION_TYPE_Trade: type 3 ORGANIZATION_TYPE_Trade: type 4 ORGANIZATION_TYPE_Trade: type 5 ORGANIZATION_TYPE_Trade: type 6 ORGANIZATION_TYPE_Trade: type 7 ORGANIZATION_TYPE_Transport: type 1 ORGANIZATION_TYPE_Transport: type 2 ORGANIZATION_TYPE_Transport: type 3 ORGANIZATION_TYPE_Transport: type 4 ORGANIZATION_TYPE_University ORGANIZATION_TYPE_XNA ORGANIZATION_TYPE_nan FONDKAPREMONT_MODE_not specified FONDKAPREMONT_MODE_org spec account FONDKAPREMONT_MODE_reg oper account FONDKAPREMONT_MODE_reg oper spec account FONDKAPREMONT_MODE_nan HOUSETYPE_MODE_block of flats HOUSETYPE_MODE_specific housing HOUSETYPE_MODE_terraced house HOUSETYPE_MODE_nan WALLSMATERIAL_MODE_Block WALLSMATERIAL_MODE_Mixed WALLSMATERIAL_MODE_Monolithic WALLSMATERIAL_MODE_Others WALLSMATERIAL_MODE_Panel WALLSMATERIAL_MODE_Stone, brick WALLSMATERIAL_MODE_Wooden WALLSMATERIAL_MODE_nan EMERGENCYSTATE_MODE_No EMERGENCYSTATE_MODE_Yes EMERGENCYSTATE_MODE_nan DAYS_EMPLOYED_PERC INCOME_CREDIT_PERC INCOME_PER_PERSON ANNUITY_INCOME_PERC PAYMENT_RATE
SK_ID_CURR
100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0.067329 0.498036 202500.0 0.121978 0.060749
100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 ... 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0.070862 0.208736 135000.0 0.132217 0.027598

2 rows × 259 columns

(356251, 259)
# 4.0  Read bureau_agg
bureau_agg = pd.read_csv(
                   'processed_bureau_agg.csv.zip',
                   nrows = num_rows
                   )

# 4.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:

bureau_agg = reducing.Reducer().reduce(bureau_agg)
reduced df from 272.9790 MB to 222.2329 MB in 1.10 seconds
# 4.1 Set index 
bureau_agg.head(2)
bureau_agg = bureau_agg.set_index("SK_ID_CURR")
bureau_agg.head(2)
bureau_agg.shape    # (305811, 116)
SK_ID_CURR BURO_DAYS_CREDIT_MIN BURO_DAYS_CREDIT_MAX BURO_DAYS_CREDIT_MEAN BURO_DAYS_CREDIT_VAR BURO_DAYS_CREDIT_ENDDATE_MIN BURO_DAYS_CREDIT_ENDDATE_MAX BURO_DAYS_CREDIT_ENDDATE_MEAN BURO_DAYS_CREDIT_UPDATE_MEAN BURO_CREDIT_DAY_OVERDUE_MAX BURO_CREDIT_DAY_OVERDUE_MEAN BURO_AMT_CREDIT_MAX_OVERDUE_MEAN BURO_AMT_CREDIT_SUM_MAX BURO_AMT_CREDIT_SUM_MEAN BURO_AMT_CREDIT_SUM_SUM BURO_AMT_CREDIT_SUM_DEBT_MAX BURO_AMT_CREDIT_SUM_DEBT_MEAN BURO_AMT_CREDIT_SUM_DEBT_SUM BURO_AMT_CREDIT_SUM_OVERDUE_MEAN BURO_AMT_CREDIT_SUM_LIMIT_MEAN BURO_AMT_CREDIT_SUM_LIMIT_SUM BURO_AMT_ANNUITY_MAX BURO_AMT_ANNUITY_MEAN BURO_CNT_CREDIT_PROLONG_SUM BURO_MONTHS_BALANCE_MIN_MIN BURO_MONTHS_BALANCE_MAX_MAX BURO_MONTHS_BALANCE_SIZE_MEAN BURO_MONTHS_BALANCE_SIZE_SUM BURO_CREDIT_ACTIVE_Active_MEAN BURO_CREDIT_ACTIVE_Bad debt_MEAN BURO_CREDIT_ACTIVE_Closed_MEAN BURO_CREDIT_ACTIVE_Sold_MEAN BURO_CREDIT_ACTIVE_nan_MEAN BURO_CREDIT_CURRENCY_currency 1_MEAN BURO_CREDIT_CURRENCY_currency 2_MEAN BURO_CREDIT_CURRENCY_currency 3_MEAN BURO_CREDIT_CURRENCY_currency 4_MEAN BURO_CREDIT_CURRENCY_nan_MEAN BURO_CREDIT_TYPE_Another type of loan_MEAN BURO_CREDIT_TYPE_Car loan_MEAN BURO_CREDIT_TYPE_Cash loan (non-earmarked)_MEAN BURO_CREDIT_TYPE_Consumer credit_MEAN BURO_CREDIT_TYPE_Credit card_MEAN BURO_CREDIT_TYPE_Interbank credit_MEAN BURO_CREDIT_TYPE_Loan for business development_MEAN BURO_CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN BURO_CREDIT_TYPE_Microloan_MEAN BURO_CREDIT_TYPE_Mobile operator loan_MEAN ... ACTIVE_DAYS_CREDIT_ENDDATE_MIN ACTIVE_DAYS_CREDIT_ENDDATE_MAX ACTIVE_DAYS_CREDIT_ENDDATE_MEAN ACTIVE_DAYS_CREDIT_UPDATE_MEAN ACTIVE_CREDIT_DAY_OVERDUE_MAX ACTIVE_CREDIT_DAY_OVERDUE_MEAN ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN ACTIVE_AMT_CREDIT_SUM_MAX ACTIVE_AMT_CREDIT_SUM_MEAN ACTIVE_AMT_CREDIT_SUM_SUM ACTIVE_AMT_CREDIT_SUM_DEBT_MAX ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN ACTIVE_AMT_CREDIT_SUM_DEBT_SUM ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM ACTIVE_AMT_ANNUITY_MAX ACTIVE_AMT_ANNUITY_MEAN ACTIVE_CNT_CREDIT_PROLONG_SUM ACTIVE_MONTHS_BALANCE_MIN_MIN ACTIVE_MONTHS_BALANCE_MAX_MAX ACTIVE_MONTHS_BALANCE_SIZE_MEAN ACTIVE_MONTHS_BALANCE_SIZE_SUM CLOSED_DAYS_CREDIT_MIN CLOSED_DAYS_CREDIT_MAX CLOSED_DAYS_CREDIT_MEAN CLOSED_DAYS_CREDIT_VAR CLOSED_DAYS_CREDIT_ENDDATE_MIN CLOSED_DAYS_CREDIT_ENDDATE_MAX CLOSED_DAYS_CREDIT_ENDDATE_MEAN CLOSED_DAYS_CREDIT_UPDATE_MEAN CLOSED_CREDIT_DAY_OVERDUE_MAX CLOSED_CREDIT_DAY_OVERDUE_MEAN CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN CLOSED_AMT_CREDIT_SUM_MAX CLOSED_AMT_CREDIT_SUM_MEAN CLOSED_AMT_CREDIT_SUM_SUM CLOSED_AMT_CREDIT_SUM_DEBT_MAX CLOSED_AMT_CREDIT_SUM_DEBT_MEAN CLOSED_AMT_CREDIT_SUM_DEBT_SUM CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN CLOSED_AMT_CREDIT_SUM_LIMIT_SUM CLOSED_AMT_ANNUITY_MAX CLOSED_AMT_ANNUITY_MEAN CLOSED_CNT_CREDIT_PROLONG_SUM CLOSED_MONTHS_BALANCE_MIN_MIN CLOSED_MONTHS_BALANCE_MAX_MAX CLOSED_MONTHS_BALANCE_SIZE_MEAN CLOSED_MONTHS_BALANCE_SIZE_SUM
0 100001 -1572 -49 -735.0 240043.666667 -1329.0 1778.0 82.428571 -93.14286 0 0.0 NaN 378000.0 207623.571429 1.453365e+06 373239.0 85240.928571 596686.5 0.0 0.00000 0.000000 10822.5 3545.357143 0 -51.0 0.0 24.571429 172.0 0.428571 0.0 0.571429 0.0 0 1.0 0.0 0.0 0.0 0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 411.0 1778.0 1030.333333 -10.666667 0.0 0.0 NaN 378000.0 294675.0000 884025.000 373239.0 198895.5 596686.5 0.0 0.0000 0.000 10822.5 8272.5 0.0 -18.0 0.0 10.666667 32.0 -1572.0 -857.0 -1054.25 119594.25 -1329.0 -179.0 -628.5 -155.000000 0.0 0.0 NaN 279720.0 142335.0 569340.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -51.0 0.0 35.0 140.0
1 100002 -1437 -103 -874.0 186150.000000 -1072.0 780.0 -349.000000 -499.87500 0 0.0 1681.029 450000.0 108131.945625 8.650556e+05 245781.0 49156.200000 245781.0 0.0 7997.14125 31988.564453 0.0 0.000000 0 -47.0 0.0 13.750000 110.0 0.250000 0.0 0.750000 0.0 0 1.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.5 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 780.0 780.0 780.000000 -15.500000 0.0 0.0 40.5 450000.0 240994.2825 481988.565 245781.0 122890.5 245781.0 0.0 15994.2825 31988.565 0.0 0.0 0.0 -34.0 0.0 10.000000 20.0 -1437.0 -476.0 -974.50 123956.70 -1072.0 85.0 -574.8 -661.333333 0.0 0.0 2091.16125 135000.0 63844.5 383067.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -47.0 0.0 15.0 90.0

2 rows × 117 columns

BURO_DAYS_CREDIT_MIN BURO_DAYS_CREDIT_MAX BURO_DAYS_CREDIT_MEAN BURO_DAYS_CREDIT_VAR BURO_DAYS_CREDIT_ENDDATE_MIN BURO_DAYS_CREDIT_ENDDATE_MAX BURO_DAYS_CREDIT_ENDDATE_MEAN BURO_DAYS_CREDIT_UPDATE_MEAN BURO_CREDIT_DAY_OVERDUE_MAX BURO_CREDIT_DAY_OVERDUE_MEAN BURO_AMT_CREDIT_MAX_OVERDUE_MEAN BURO_AMT_CREDIT_SUM_MAX BURO_AMT_CREDIT_SUM_MEAN BURO_AMT_CREDIT_SUM_SUM BURO_AMT_CREDIT_SUM_DEBT_MAX BURO_AMT_CREDIT_SUM_DEBT_MEAN BURO_AMT_CREDIT_SUM_DEBT_SUM BURO_AMT_CREDIT_SUM_OVERDUE_MEAN BURO_AMT_CREDIT_SUM_LIMIT_MEAN BURO_AMT_CREDIT_SUM_LIMIT_SUM BURO_AMT_ANNUITY_MAX BURO_AMT_ANNUITY_MEAN BURO_CNT_CREDIT_PROLONG_SUM BURO_MONTHS_BALANCE_MIN_MIN BURO_MONTHS_BALANCE_MAX_MAX BURO_MONTHS_BALANCE_SIZE_MEAN BURO_MONTHS_BALANCE_SIZE_SUM BURO_CREDIT_ACTIVE_Active_MEAN BURO_CREDIT_ACTIVE_Bad debt_MEAN BURO_CREDIT_ACTIVE_Closed_MEAN BURO_CREDIT_ACTIVE_Sold_MEAN BURO_CREDIT_ACTIVE_nan_MEAN BURO_CREDIT_CURRENCY_currency 1_MEAN BURO_CREDIT_CURRENCY_currency 2_MEAN BURO_CREDIT_CURRENCY_currency 3_MEAN BURO_CREDIT_CURRENCY_currency 4_MEAN BURO_CREDIT_CURRENCY_nan_MEAN BURO_CREDIT_TYPE_Another type of loan_MEAN BURO_CREDIT_TYPE_Car loan_MEAN BURO_CREDIT_TYPE_Cash loan (non-earmarked)_MEAN BURO_CREDIT_TYPE_Consumer credit_MEAN BURO_CREDIT_TYPE_Credit card_MEAN BURO_CREDIT_TYPE_Interbank credit_MEAN BURO_CREDIT_TYPE_Loan for business development_MEAN BURO_CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN BURO_CREDIT_TYPE_Microloan_MEAN BURO_CREDIT_TYPE_Mobile operator loan_MEAN BURO_CREDIT_TYPE_Mortgage_MEAN ... ACTIVE_DAYS_CREDIT_ENDDATE_MIN ACTIVE_DAYS_CREDIT_ENDDATE_MAX ACTIVE_DAYS_CREDIT_ENDDATE_MEAN ACTIVE_DAYS_CREDIT_UPDATE_MEAN ACTIVE_CREDIT_DAY_OVERDUE_MAX ACTIVE_CREDIT_DAY_OVERDUE_MEAN ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN ACTIVE_AMT_CREDIT_SUM_MAX ACTIVE_AMT_CREDIT_SUM_MEAN ACTIVE_AMT_CREDIT_SUM_SUM ACTIVE_AMT_CREDIT_SUM_DEBT_MAX ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN ACTIVE_AMT_CREDIT_SUM_DEBT_SUM ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM ACTIVE_AMT_ANNUITY_MAX ACTIVE_AMT_ANNUITY_MEAN ACTIVE_CNT_CREDIT_PROLONG_SUM ACTIVE_MONTHS_BALANCE_MIN_MIN ACTIVE_MONTHS_BALANCE_MAX_MAX ACTIVE_MONTHS_BALANCE_SIZE_MEAN ACTIVE_MONTHS_BALANCE_SIZE_SUM CLOSED_DAYS_CREDIT_MIN CLOSED_DAYS_CREDIT_MAX CLOSED_DAYS_CREDIT_MEAN CLOSED_DAYS_CREDIT_VAR CLOSED_DAYS_CREDIT_ENDDATE_MIN CLOSED_DAYS_CREDIT_ENDDATE_MAX CLOSED_DAYS_CREDIT_ENDDATE_MEAN CLOSED_DAYS_CREDIT_UPDATE_MEAN CLOSED_CREDIT_DAY_OVERDUE_MAX CLOSED_CREDIT_DAY_OVERDUE_MEAN CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN CLOSED_AMT_CREDIT_SUM_MAX CLOSED_AMT_CREDIT_SUM_MEAN CLOSED_AMT_CREDIT_SUM_SUM CLOSED_AMT_CREDIT_SUM_DEBT_MAX CLOSED_AMT_CREDIT_SUM_DEBT_MEAN CLOSED_AMT_CREDIT_SUM_DEBT_SUM CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN CLOSED_AMT_CREDIT_SUM_LIMIT_SUM CLOSED_AMT_ANNUITY_MAX CLOSED_AMT_ANNUITY_MEAN CLOSED_CNT_CREDIT_PROLONG_SUM CLOSED_MONTHS_BALANCE_MIN_MIN CLOSED_MONTHS_BALANCE_MAX_MAX CLOSED_MONTHS_BALANCE_SIZE_MEAN CLOSED_MONTHS_BALANCE_SIZE_SUM
SK_ID_CURR
100001 -1572 -49 -735.0 240043.666667 -1329.0 1778.0 82.428571 -93.14286 0 0.0 NaN 378000.0 207623.571429 1.453365e+06 373239.0 85240.928571 596686.5 0.0 0.00000 0.000000 10822.5 3545.357143 0 -51.0 0.0 24.571429 172.0 0.428571 0.0 0.571429 0.0 0 1.0 0.0 0.0 0.0 0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 411.0 1778.0 1030.333333 -10.666667 0.0 0.0 NaN 378000.0 294675.0000 884025.000 373239.0 198895.5 596686.5 0.0 0.0000 0.000 10822.5 8272.5 0.0 -18.0 0.0 10.666667 32.0 -1572.0 -857.0 -1054.25 119594.25 -1329.0 -179.0 -628.5 -155.000000 0.0 0.0 NaN 279720.0 142335.0 569340.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -51.0 0.0 35.0 140.0
100002 -1437 -103 -874.0 186150.000000 -1072.0 780.0 -349.000000 -499.87500 0 0.0 1681.029 450000.0 108131.945625 8.650556e+05 245781.0 49156.200000 245781.0 0.0 7997.14125 31988.564453 0.0 0.000000 0 -47.0 0.0 13.750000 110.0 0.250000 0.0 0.750000 0.0 0 1.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0.5 0.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 780.0 780.0 780.000000 -15.500000 0.0 0.0 40.5 450000.0 240994.2825 481988.565 245781.0 122890.5 245781.0 0.0 15994.2825 31988.565 0.0 0.0 0.0 -34.0 0.0 10.000000 20.0 -1437.0 -476.0 -974.50 123956.70 -1072.0 85.0 -574.8 -661.333333 0.0 0.0 2091.16125 135000.0 63844.5 383067.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -47.0 0.0 15.0 90.0

2 rows × 116 columns

(305811, 116)
# 5.0 Join bureau_agg with df
df = df.join(
             bureau_agg,
             how='left',
             on='SK_ID_CURR'
            )
# 5.1
df.shape    # (356251, 375)
df.head(2)
(356251, 375)
TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE ... ACTIVE_DAYS_CREDIT_ENDDATE_MIN ACTIVE_DAYS_CREDIT_ENDDATE_MAX ACTIVE_DAYS_CREDIT_ENDDATE_MEAN ACTIVE_DAYS_CREDIT_UPDATE_MEAN ACTIVE_CREDIT_DAY_OVERDUE_MAX ACTIVE_CREDIT_DAY_OVERDUE_MEAN ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN ACTIVE_AMT_CREDIT_SUM_MAX ACTIVE_AMT_CREDIT_SUM_MEAN ACTIVE_AMT_CREDIT_SUM_SUM ACTIVE_AMT_CREDIT_SUM_DEBT_MAX ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN ACTIVE_AMT_CREDIT_SUM_DEBT_SUM ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM ACTIVE_AMT_ANNUITY_MAX ACTIVE_AMT_ANNUITY_MEAN ACTIVE_CNT_CREDIT_PROLONG_SUM ACTIVE_MONTHS_BALANCE_MIN_MIN ACTIVE_MONTHS_BALANCE_MAX_MAX ACTIVE_MONTHS_BALANCE_SIZE_MEAN ACTIVE_MONTHS_BALANCE_SIZE_SUM CLOSED_DAYS_CREDIT_MIN CLOSED_DAYS_CREDIT_MAX CLOSED_DAYS_CREDIT_MEAN CLOSED_DAYS_CREDIT_VAR CLOSED_DAYS_CREDIT_ENDDATE_MIN CLOSED_DAYS_CREDIT_ENDDATE_MAX CLOSED_DAYS_CREDIT_ENDDATE_MEAN CLOSED_DAYS_CREDIT_UPDATE_MEAN CLOSED_CREDIT_DAY_OVERDUE_MAX CLOSED_CREDIT_DAY_OVERDUE_MEAN CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN CLOSED_AMT_CREDIT_SUM_MAX CLOSED_AMT_CREDIT_SUM_MEAN CLOSED_AMT_CREDIT_SUM_SUM CLOSED_AMT_CREDIT_SUM_DEBT_MAX CLOSED_AMT_CREDIT_SUM_DEBT_MEAN CLOSED_AMT_CREDIT_SUM_DEBT_SUM CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN CLOSED_AMT_CREDIT_SUM_LIMIT_SUM CLOSED_AMT_ANNUITY_MAX CLOSED_AMT_ANNUITY_MEAN CLOSED_CNT_CREDIT_PROLONG_SUM CLOSED_MONTHS_BALANCE_MIN_MIN CLOSED_MONTHS_BALANCE_MAX_MAX CLOSED_MONTHS_BALANCE_SIZE_MEAN CLOSED_MONTHS_BALANCE_SIZE_SUM
SK_ID_CURR
100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 ... 780.0 780.0 780.0 -15.5 0.0 0.0 40.5 450000.0 240994.2825 481988.565 245781.0 122890.5 245781.0 0.0 15994.2825 31988.565 0.0 0.0 0.0 -34.0 0.0 10.0 20.0 -1437.0 -476.0 -974.500000 123956.700000 -1072.0 85.0 -574.800000 -661.333333 0.0 0.0 2091.16125 135000.0 63844.5 383067.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -47.0 0.0 15.0 90.0
100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 ... 1216.0 1216.0 1216.0 -43.0 0.0 0.0 0.0 810000.0 810000.0000 810000.000 0.0 0.0 0.0 0.0 810000.0000 810000.000 NaN NaN 0.0 NaN NaN NaN 0.0 -2586.0 -775.0 -1665.666667 820590.333333 -2434.0 -420.0 -1131.333333 -1073.666667 0.0 0.0 0.00000 112500.0 69133.5 207400.5 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN 0.0 NaN NaN NaN 0.0

2 rows × 375 columns

# 5.2 Read previous application data
prev_agg = pd.read_csv(
                   'processed_prev_agg.csv.zip',
                   nrows = num_rows
                   )

# 5.3 Reduce memory usage by appropriately
#       changing data-types per feature:

prev_agg = reducing.Reducer().reduce(prev_agg)
reduced df from 646.3186 MB to 405.8881 MB in 2.13 seconds
# 5.3 Set Index
prev_agg.shape    # (338857, 250)
prev_agg.head(2)
prev_agg = prev_agg.set_index("SK_ID_CURR")
prev_agg.head(2)
prev_agg.shape    # (338857, 250)
(338857, 250)
SK_ID_CURR PREV_AMT_ANNUITY_MIN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_MEAN PREV_AMT_APPLICATION_MIN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_MEAN PREV_AMT_CREDIT_MIN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_MEAN PREV_APP_CREDIT_PERC_MIN PREV_APP_CREDIT_PERC_MAX PREV_APP_CREDIT_PERC_MEAN PREV_APP_CREDIT_PERC_VAR PREV_AMT_DOWN_PAYMENT_MIN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_GOODS_PRICE_MIN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_MEAN PREV_HOUR_APPR_PROCESS_START_MIN PREV_HOUR_APPR_PROCESS_START_MAX PREV_HOUR_APPR_PROCESS_START_MEAN PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN PREV_NAME_CONTRACT_TYPE_XNA_MEAN PREV_NAME_CONTRACT_TYPE_nan_MEAN PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_nan_MEAN PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN PREV_FLAG_LAST_APPL_PER_CONTRACT_nan_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN ... APPROVED_APP_CREDIT_PERC_MAX APPROVED_APP_CREDIT_PERC_MEAN APPROVED_APP_CREDIT_PERC_VAR APPROVED_AMT_DOWN_PAYMENT_MIN APPROVED_AMT_DOWN_PAYMENT_MAX APPROVED_AMT_DOWN_PAYMENT_MEAN APPROVED_AMT_GOODS_PRICE_MIN APPROVED_AMT_GOODS_PRICE_MAX APPROVED_AMT_GOODS_PRICE_MEAN APPROVED_HOUR_APPR_PROCESS_START_MIN APPROVED_HOUR_APPR_PROCESS_START_MAX APPROVED_HOUR_APPR_PROCESS_START_MEAN APPROVED_RATE_DOWN_PAYMENT_MIN APPROVED_RATE_DOWN_PAYMENT_MAX APPROVED_RATE_DOWN_PAYMENT_MEAN APPROVED_DAYS_DECISION_MIN APPROVED_DAYS_DECISION_MAX APPROVED_DAYS_DECISION_MEAN APPROVED_CNT_PAYMENT_MEAN APPROVED_CNT_PAYMENT_SUM REFUSED_AMT_ANNUITY_MIN REFUSED_AMT_ANNUITY_MAX REFUSED_AMT_ANNUITY_MEAN REFUSED_AMT_APPLICATION_MIN REFUSED_AMT_APPLICATION_MAX REFUSED_AMT_APPLICATION_MEAN REFUSED_AMT_CREDIT_MIN REFUSED_AMT_CREDIT_MAX REFUSED_AMT_CREDIT_MEAN REFUSED_APP_CREDIT_PERC_MIN REFUSED_APP_CREDIT_PERC_MAX REFUSED_APP_CREDIT_PERC_MEAN REFUSED_APP_CREDIT_PERC_VAR REFUSED_AMT_DOWN_PAYMENT_MIN REFUSED_AMT_DOWN_PAYMENT_MAX REFUSED_AMT_DOWN_PAYMENT_MEAN REFUSED_AMT_GOODS_PRICE_MIN REFUSED_AMT_GOODS_PRICE_MAX REFUSED_AMT_GOODS_PRICE_MEAN REFUSED_HOUR_APPR_PROCESS_START_MIN REFUSED_HOUR_APPR_PROCESS_START_MAX REFUSED_HOUR_APPR_PROCESS_START_MEAN REFUSED_RATE_DOWN_PAYMENT_MIN REFUSED_RATE_DOWN_PAYMENT_MAX REFUSED_RATE_DOWN_PAYMENT_MEAN REFUSED_DAYS_DECISION_MIN REFUSED_DAYS_DECISION_MAX REFUSED_DAYS_DECISION_MEAN REFUSED_CNT_PAYMENT_MEAN REFUSED_CNT_PAYMENT_SUM
0 100001 3951.000 3951.000 3951.000 24835.5 24835.5 24835.5 23787.0 23787.0 23787.0 1.044079 1.044079 1.044079 NaN 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5 13 13 13.0 0.104326 0.104326 0.104326 -1740 -1740 -1740.0 8.0 8.0 0.0 1.0 0.0 0.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 1.0 0 0.0 0.0 0.0 ... 1.044079 1.044079 NaN 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5 13.0 13.0 13.0 0.104326 0.104326 0.104326 -1740.0 -1740.0 -1740.0 8.0 8.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 100002 9251.775 9251.775 9251.775 179055.0 179055.0 179055.0 179055.0 179055.0 179055.0 1.000000 1.000000 1.000000 NaN 0.0 0.0 0.0 179055.0 179055.0 179055.0 9 9 9.0 0.000000 0.000000 0.000000 -606 -606 -606.0 24.0 24.0 0.0 1.0 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0 0.0 1.0 0 0.0 0.0 0.0 ... 1.000000 1.000000 NaN 0.0 0.0 0.0 179055.0 179055.0 179055.0 9.0 9.0 9.0 0.000000 0.000000 0.000000 -606.0 -606.0 -606.0 24.0 24.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 250 columns

PREV_AMT_ANNUITY_MIN PREV_AMT_ANNUITY_MAX PREV_AMT_ANNUITY_MEAN PREV_AMT_APPLICATION_MIN PREV_AMT_APPLICATION_MAX PREV_AMT_APPLICATION_MEAN PREV_AMT_CREDIT_MIN PREV_AMT_CREDIT_MAX PREV_AMT_CREDIT_MEAN PREV_APP_CREDIT_PERC_MIN PREV_APP_CREDIT_PERC_MAX PREV_APP_CREDIT_PERC_MEAN PREV_APP_CREDIT_PERC_VAR PREV_AMT_DOWN_PAYMENT_MIN PREV_AMT_DOWN_PAYMENT_MAX PREV_AMT_DOWN_PAYMENT_MEAN PREV_AMT_GOODS_PRICE_MIN PREV_AMT_GOODS_PRICE_MAX PREV_AMT_GOODS_PRICE_MEAN PREV_HOUR_APPR_PROCESS_START_MIN PREV_HOUR_APPR_PROCESS_START_MAX PREV_HOUR_APPR_PROCESS_START_MEAN PREV_RATE_DOWN_PAYMENT_MIN PREV_RATE_DOWN_PAYMENT_MAX PREV_RATE_DOWN_PAYMENT_MEAN PREV_DAYS_DECISION_MIN PREV_DAYS_DECISION_MAX PREV_DAYS_DECISION_MEAN PREV_CNT_PAYMENT_MEAN PREV_CNT_PAYMENT_SUM PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN PREV_NAME_CONTRACT_TYPE_XNA_MEAN PREV_NAME_CONTRACT_TYPE_nan_MEAN PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN PREV_WEEKDAY_APPR_PROCESS_START_nan_MEAN PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN PREV_FLAG_LAST_APPL_PER_CONTRACT_nan_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN ... APPROVED_APP_CREDIT_PERC_MAX APPROVED_APP_CREDIT_PERC_MEAN APPROVED_APP_CREDIT_PERC_VAR APPROVED_AMT_DOWN_PAYMENT_MIN APPROVED_AMT_DOWN_PAYMENT_MAX APPROVED_AMT_DOWN_PAYMENT_MEAN APPROVED_AMT_GOODS_PRICE_MIN APPROVED_AMT_GOODS_PRICE_MAX APPROVED_AMT_GOODS_PRICE_MEAN APPROVED_HOUR_APPR_PROCESS_START_MIN APPROVED_HOUR_APPR_PROCESS_START_MAX APPROVED_HOUR_APPR_PROCESS_START_MEAN APPROVED_RATE_DOWN_PAYMENT_MIN APPROVED_RATE_DOWN_PAYMENT_MAX APPROVED_RATE_DOWN_PAYMENT_MEAN APPROVED_DAYS_DECISION_MIN APPROVED_DAYS_DECISION_MAX APPROVED_DAYS_DECISION_MEAN APPROVED_CNT_PAYMENT_MEAN APPROVED_CNT_PAYMENT_SUM REFUSED_AMT_ANNUITY_MIN REFUSED_AMT_ANNUITY_MAX REFUSED_AMT_ANNUITY_MEAN REFUSED_AMT_APPLICATION_MIN REFUSED_AMT_APPLICATION_MAX REFUSED_AMT_APPLICATION_MEAN REFUSED_AMT_CREDIT_MIN REFUSED_AMT_CREDIT_MAX REFUSED_AMT_CREDIT_MEAN REFUSED_APP_CREDIT_PERC_MIN REFUSED_APP_CREDIT_PERC_MAX REFUSED_APP_CREDIT_PERC_MEAN REFUSED_APP_CREDIT_PERC_VAR REFUSED_AMT_DOWN_PAYMENT_MIN REFUSED_AMT_DOWN_PAYMENT_MAX REFUSED_AMT_DOWN_PAYMENT_MEAN REFUSED_AMT_GOODS_PRICE_MIN REFUSED_AMT_GOODS_PRICE_MAX REFUSED_AMT_GOODS_PRICE_MEAN REFUSED_HOUR_APPR_PROCESS_START_MIN REFUSED_HOUR_APPR_PROCESS_START_MAX REFUSED_HOUR_APPR_PROCESS_START_MEAN REFUSED_RATE_DOWN_PAYMENT_MIN REFUSED_RATE_DOWN_PAYMENT_MAX REFUSED_RATE_DOWN_PAYMENT_MEAN REFUSED_DAYS_DECISION_MIN REFUSED_DAYS_DECISION_MAX REFUSED_DAYS_DECISION_MEAN REFUSED_CNT_PAYMENT_MEAN REFUSED_CNT_PAYMENT_SUM
SK_ID_CURR
100001 3951.000 3951.000 3951.000 24835.5 24835.5 24835.5 23787.0 23787.0 23787.0 1.044079 1.044079 1.044079 NaN 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5 13 13 13.0 0.104326 0.104326 0.104326 -1740 -1740 -1740.0 8.0 8.0 0.0 1.0 0.0 0.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 1.0 0 0.0 0.0 0.0 0.0 ... 1.044079 1.044079 NaN 2520.0 2520.0 2520.0 24835.5 24835.5 24835.5 13.0 13.0 13.0 0.104326 0.104326 0.104326 -1740.0 -1740.0 -1740.0 8.0 8.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
100002 9251.775 9251.775 9251.775 179055.0 179055.0 179055.0 179055.0 179055.0 179055.0 1.000000 1.000000 1.000000 NaN 0.0 0.0 0.0 179055.0 179055.0 179055.0 9 9 9.0 0.000000 0.000000 0.000000 -606 -606 -606.0 24.0 24.0 0.0 1.0 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0 0.0 1.0 0 0.0 0.0 0.0 0.0 ... 1.000000 1.000000 NaN 0.0 0.0 0.0 179055.0 179055.0 179055.0 9.0 9.0 9.0 0.000000 0.000000 0.000000 -606.0 -606.0 -606.0 24.0 24.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 249 columns

(338857, 249)
# 6.0 Join prev_agg with df
df = df.join(prev_agg, how='left', on='SK_ID_CURR')
df.shape    # (356251, 624)
df.head(2)
(356251, 624)
TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE ... APPROVED_APP_CREDIT_PERC_MAX APPROVED_APP_CREDIT_PERC_MEAN APPROVED_APP_CREDIT_PERC_VAR APPROVED_AMT_DOWN_PAYMENT_MIN APPROVED_AMT_DOWN_PAYMENT_MAX APPROVED_AMT_DOWN_PAYMENT_MEAN APPROVED_AMT_GOODS_PRICE_MIN APPROVED_AMT_GOODS_PRICE_MAX APPROVED_AMT_GOODS_PRICE_MEAN APPROVED_HOUR_APPR_PROCESS_START_MIN APPROVED_HOUR_APPR_PROCESS_START_MAX APPROVED_HOUR_APPR_PROCESS_START_MEAN APPROVED_RATE_DOWN_PAYMENT_MIN APPROVED_RATE_DOWN_PAYMENT_MAX APPROVED_RATE_DOWN_PAYMENT_MEAN APPROVED_DAYS_DECISION_MIN APPROVED_DAYS_DECISION_MAX APPROVED_DAYS_DECISION_MEAN APPROVED_CNT_PAYMENT_MEAN APPROVED_CNT_PAYMENT_SUM REFUSED_AMT_ANNUITY_MIN REFUSED_AMT_ANNUITY_MAX REFUSED_AMT_ANNUITY_MEAN REFUSED_AMT_APPLICATION_MIN REFUSED_AMT_APPLICATION_MAX REFUSED_AMT_APPLICATION_MEAN REFUSED_AMT_CREDIT_MIN REFUSED_AMT_CREDIT_MAX REFUSED_AMT_CREDIT_MEAN REFUSED_APP_CREDIT_PERC_MIN REFUSED_APP_CREDIT_PERC_MAX REFUSED_APP_CREDIT_PERC_MEAN REFUSED_APP_CREDIT_PERC_VAR REFUSED_AMT_DOWN_PAYMENT_MIN REFUSED_AMT_DOWN_PAYMENT_MAX REFUSED_AMT_DOWN_PAYMENT_MEAN REFUSED_AMT_GOODS_PRICE_MIN REFUSED_AMT_GOODS_PRICE_MAX REFUSED_AMT_GOODS_PRICE_MEAN REFUSED_HOUR_APPR_PROCESS_START_MIN REFUSED_HOUR_APPR_PROCESS_START_MAX REFUSED_HOUR_APPR_PROCESS_START_MEAN REFUSED_RATE_DOWN_PAYMENT_MIN REFUSED_RATE_DOWN_PAYMENT_MAX REFUSED_RATE_DOWN_PAYMENT_MEAN REFUSED_DAYS_DECISION_MIN REFUSED_DAYS_DECISION_MAX REFUSED_DAYS_DECISION_MEAN REFUSED_CNT_PAYMENT_MEAN REFUSED_CNT_PAYMENT_SUM
SK_ID_CURR
100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 ... 1.000000 1.000000 NaN 0.0 0.0 0.0 179055.0 179055.0 179055.0 9.0 9.0 9.000000 0.0 0.000000 0.00000 -606.0 -606.0 -606.0 24.0 24.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 ... 1.011109 0.949329 0.005324 0.0 6885.0 3442.5 68809.5 900000.0 435436.5 12.0 17.0 14.666667 0.0 0.100061 0.05003 -2341.0 -746.0 -1305.0 10.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 624 columns

# 7.0 Read processed POS data
pos_agg = pd.read_csv(
                   'processed_pos_agg.csv.zip',
                   nrows = num_rows
                   )

# 7.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:

pos_agg = reducing.Reducer().reduce(pos_agg)
reduced df from 48.8877 MB to 19.9411 MB in 0.22 seconds
# 7.1
pos_agg.shape    # (337252, 19)
pos_agg.head(2)
pos_agg = pos_agg.set_index("SK_ID_CURR")
pos_agg.head(2)
pos_agg.shape   # (337252, 18)
(337252, 19)
SK_ID_CURR POS_MONTHS_BALANCE_MAX POS_MONTHS_BALANCE_MEAN POS_MONTHS_BALANCE_SIZE POS_SK_DPD_MAX POS_SK_DPD_MEAN POS_SK_DPD_DEF_MAX POS_SK_DPD_DEF_MEAN POS_NAME_CONTRACT_STATUS_Active_MEAN POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN POS_NAME_CONTRACT_STATUS_Approved_MEAN POS_NAME_CONTRACT_STATUS_Canceled_MEAN POS_NAME_CONTRACT_STATUS_Completed_MEAN POS_NAME_CONTRACT_STATUS_Demand_MEAN POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN POS_NAME_CONTRACT_STATUS_Signed_MEAN POS_NAME_CONTRACT_STATUS_XNA_MEAN POS_NAME_CONTRACT_STATUS_nan_MEAN POS_COUNT
0 100001 -53 -72.555557 9 7 0.777778 7 0.777778 0.777778 0.0 0.0 0.0 0.222222 0.0 0.0 0.0 0.0 0 9
1 100002 -1 -10.000000 19 0 0.000000 0 0.000000 1.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0 19
POS_MONTHS_BALANCE_MAX POS_MONTHS_BALANCE_MEAN POS_MONTHS_BALANCE_SIZE POS_SK_DPD_MAX POS_SK_DPD_MEAN POS_SK_DPD_DEF_MAX POS_SK_DPD_DEF_MEAN POS_NAME_CONTRACT_STATUS_Active_MEAN POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN POS_NAME_CONTRACT_STATUS_Approved_MEAN POS_NAME_CONTRACT_STATUS_Canceled_MEAN POS_NAME_CONTRACT_STATUS_Completed_MEAN POS_NAME_CONTRACT_STATUS_Demand_MEAN POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN POS_NAME_CONTRACT_STATUS_Signed_MEAN POS_NAME_CONTRACT_STATUS_XNA_MEAN POS_NAME_CONTRACT_STATUS_nan_MEAN POS_COUNT
SK_ID_CURR
100001 -53 -72.555557 9 7 0.777778 7 0.777778 0.777778 0.0 0.0 0.0 0.222222 0.0 0.0 0.0 0.0 0 9
100002 -1 -10.000000 19 0 0.000000 0 0.000000 1.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0 19
(337252, 18)
# 7.2 Join POS with df
df = df.join(
             pos_agg,
             how='left',
             on='SK_ID_CURR'
             )

df.shape    # (356251, 642)
df.head(2)
(356251, 642)
TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE ... APPROVED_CNT_PAYMENT_MEAN APPROVED_CNT_PAYMENT_SUM REFUSED_AMT_ANNUITY_MIN REFUSED_AMT_ANNUITY_MAX REFUSED_AMT_ANNUITY_MEAN REFUSED_AMT_APPLICATION_MIN REFUSED_AMT_APPLICATION_MAX REFUSED_AMT_APPLICATION_MEAN REFUSED_AMT_CREDIT_MIN REFUSED_AMT_CREDIT_MAX REFUSED_AMT_CREDIT_MEAN REFUSED_APP_CREDIT_PERC_MIN REFUSED_APP_CREDIT_PERC_MAX REFUSED_APP_CREDIT_PERC_MEAN REFUSED_APP_CREDIT_PERC_VAR REFUSED_AMT_DOWN_PAYMENT_MIN REFUSED_AMT_DOWN_PAYMENT_MAX REFUSED_AMT_DOWN_PAYMENT_MEAN REFUSED_AMT_GOODS_PRICE_MIN REFUSED_AMT_GOODS_PRICE_MAX REFUSED_AMT_GOODS_PRICE_MEAN REFUSED_HOUR_APPR_PROCESS_START_MIN REFUSED_HOUR_APPR_PROCESS_START_MAX REFUSED_HOUR_APPR_PROCESS_START_MEAN REFUSED_RATE_DOWN_PAYMENT_MIN REFUSED_RATE_DOWN_PAYMENT_MAX REFUSED_RATE_DOWN_PAYMENT_MEAN REFUSED_DAYS_DECISION_MIN REFUSED_DAYS_DECISION_MAX REFUSED_DAYS_DECISION_MEAN REFUSED_CNT_PAYMENT_MEAN REFUSED_CNT_PAYMENT_SUM POS_MONTHS_BALANCE_MAX POS_MONTHS_BALANCE_MEAN POS_MONTHS_BALANCE_SIZE POS_SK_DPD_MAX POS_SK_DPD_MEAN POS_SK_DPD_DEF_MAX POS_SK_DPD_DEF_MEAN POS_NAME_CONTRACT_STATUS_Active_MEAN POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN POS_NAME_CONTRACT_STATUS_Approved_MEAN POS_NAME_CONTRACT_STATUS_Canceled_MEAN POS_NAME_CONTRACT_STATUS_Completed_MEAN POS_NAME_CONTRACT_STATUS_Demand_MEAN POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN POS_NAME_CONTRACT_STATUS_Signed_MEAN POS_NAME_CONTRACT_STATUS_XNA_MEAN POS_NAME_CONTRACT_STATUS_nan_MEAN POS_COUNT
SK_ID_CURR
100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 ... 24.0 24.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -1.0 -10.000000 19.0 0.0 0.0 0.0 0.0 1.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 19.0
100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 ... 10.0 30.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN -18.0 -43.785713 28.0 0.0 0.0 0.0 0.0 0.928571 0.0 0.0 0.0 0.071429 0.0 0.0 0.0 0.0 0.0 28.0

2 rows × 642 columns

# 8.0 Read processed installments data
ins_agg = pd.read_csv(
                   'processed_ins_agg.csv.zip',
                   nrows = num_rows
                   )

# 8.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:

ins_agg = reducing.Reducer().reduce(ins_agg)
reduced df from 69.9529 MB to 46.3114 MB in 0.32 seconds
# 8.1 Set index
ins_agg.shape    # (339587, 26)
ins_agg.head(2)
ins_agg = ins_agg.set_index("SK_ID_CURR")
ins_agg.head(2)
ins_agg.shape   # (339587, 25)
(339587, 26)
SK_ID_CURR INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE INSTAL_DPD_MAX INSTAL_DPD_MEAN INSTAL_DPD_SUM INSTAL_DBD_MAX INSTAL_DBD_MEAN INSTAL_DBD_SUM INSTAL_PAYMENT_PERC_MAX INSTAL_PAYMENT_PERC_MEAN INSTAL_PAYMENT_PERC_VAR INSTAL_PAYMENT_DIFF_MAX INSTAL_PAYMENT_DIFF_MEAN INSTAL_PAYMENT_DIFF_SUM INSTAL_PAYMENT_DIFF_VAR INSTAL_AMT_INSTALMENT_MAX INSTAL_AMT_INSTALMENT_MEAN INSTAL_AMT_INSTALMENT_SUM INSTAL_AMT_PAYMENT_MIN INSTAL_AMT_PAYMENT_MAX INSTAL_AMT_PAYMENT_MEAN INSTAL_AMT_PAYMENT_SUM INSTAL_DAYS_ENTRY_PAYMENT_MAX INSTAL_DAYS_ENTRY_PAYMENT_MEAN INSTAL_DAYS_ENTRY_PAYMENT_SUM INSTAL_COUNT
0 100001 2 11.0 1.571429 11.0 36.0 8.857142 62.0 1.0 1.0 7.564931e-17 0.000391 0.000077 0.000537 1.975468e-08 17397.900391 5885.132324 41195.925781 3951.000 17397.900 5885.132143 41195.925781 -1628.0 -2195.000000 -15365.0 7
1 100002 2 0.0 0.000000 0.0 31.0 20.421053 388.0 1.0 1.0 2.460418e-17 0.001094 0.000428 0.008125 2.602025e-08 53093.746094 11559.247070 219625.703125 9251.775 53093.745 11559.247105 219625.687500 -49.0 -315.421053 -5993.0 19
INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE INSTAL_DPD_MAX INSTAL_DPD_MEAN INSTAL_DPD_SUM INSTAL_DBD_MAX INSTAL_DBD_MEAN INSTAL_DBD_SUM INSTAL_PAYMENT_PERC_MAX INSTAL_PAYMENT_PERC_MEAN INSTAL_PAYMENT_PERC_VAR INSTAL_PAYMENT_DIFF_MAX INSTAL_PAYMENT_DIFF_MEAN INSTAL_PAYMENT_DIFF_SUM INSTAL_PAYMENT_DIFF_VAR INSTAL_AMT_INSTALMENT_MAX INSTAL_AMT_INSTALMENT_MEAN INSTAL_AMT_INSTALMENT_SUM INSTAL_AMT_PAYMENT_MIN INSTAL_AMT_PAYMENT_MAX INSTAL_AMT_PAYMENT_MEAN INSTAL_AMT_PAYMENT_SUM INSTAL_DAYS_ENTRY_PAYMENT_MAX INSTAL_DAYS_ENTRY_PAYMENT_MEAN INSTAL_DAYS_ENTRY_PAYMENT_SUM INSTAL_COUNT
SK_ID_CURR
100001 2 11.0 1.571429 11.0 36.0 8.857142 62.0 1.0 1.0 7.564931e-17 0.000391 0.000077 0.000537 1.975468e-08 17397.900391 5885.132324 41195.925781 3951.000 17397.900 5885.132143 41195.925781 -1628.0 -2195.000000 -15365.0 7
100002 2 0.0 0.000000 0.0 31.0 20.421053 388.0 1.0 1.0 2.460418e-17 0.001094 0.000428 0.008125 2.602025e-08 53093.746094 11559.247070 219625.703125 9251.775 53093.745 11559.247105 219625.687500 -49.0 -315.421053 -5993.0 19
(339587, 25)
# 9.0 Join Installments data with df
df = df.join(ins_agg, how='left', on='SK_ID_CURR')
df.shape    # (356251, 667)
df.head(2)
(356251, 667)
TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE ... REFUSED_RATE_DOWN_PAYMENT_MAX REFUSED_RATE_DOWN_PAYMENT_MEAN REFUSED_DAYS_DECISION_MIN REFUSED_DAYS_DECISION_MAX REFUSED_DAYS_DECISION_MEAN REFUSED_CNT_PAYMENT_MEAN REFUSED_CNT_PAYMENT_SUM POS_MONTHS_BALANCE_MAX POS_MONTHS_BALANCE_MEAN POS_MONTHS_BALANCE_SIZE POS_SK_DPD_MAX POS_SK_DPD_MEAN POS_SK_DPD_DEF_MAX POS_SK_DPD_DEF_MEAN POS_NAME_CONTRACT_STATUS_Active_MEAN POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN POS_NAME_CONTRACT_STATUS_Approved_MEAN POS_NAME_CONTRACT_STATUS_Canceled_MEAN POS_NAME_CONTRACT_STATUS_Completed_MEAN POS_NAME_CONTRACT_STATUS_Demand_MEAN POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN POS_NAME_CONTRACT_STATUS_Signed_MEAN POS_NAME_CONTRACT_STATUS_XNA_MEAN POS_NAME_CONTRACT_STATUS_nan_MEAN POS_COUNT INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE INSTAL_DPD_MAX INSTAL_DPD_MEAN INSTAL_DPD_SUM INSTAL_DBD_MAX INSTAL_DBD_MEAN INSTAL_DBD_SUM INSTAL_PAYMENT_PERC_MAX INSTAL_PAYMENT_PERC_MEAN INSTAL_PAYMENT_PERC_VAR INSTAL_PAYMENT_DIFF_MAX INSTAL_PAYMENT_DIFF_MEAN INSTAL_PAYMENT_DIFF_SUM INSTAL_PAYMENT_DIFF_VAR INSTAL_AMT_INSTALMENT_MAX INSTAL_AMT_INSTALMENT_MEAN INSTAL_AMT_INSTALMENT_SUM INSTAL_AMT_PAYMENT_MIN INSTAL_AMT_PAYMENT_MAX INSTAL_AMT_PAYMENT_MEAN INSTAL_AMT_PAYMENT_SUM INSTAL_DAYS_ENTRY_PAYMENT_MAX INSTAL_DAYS_ENTRY_PAYMENT_MEAN INSTAL_DAYS_ENTRY_PAYMENT_SUM INSTAL_COUNT
SK_ID_CURR
100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 ... NaN NaN NaN NaN NaN NaN NaN -1.0 -10.000000 19.0 0.0 0.0 0.0 0.0 1.000000 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 19.0 2.0 0.0 0.0 0.0 31.0 20.421053 388.0 1.0 1.0 2.460418e-17 0.001094 0.000428 0.008125 2.602025e-08 53093.746094 11559.247070 2.196257e+05 9251.775 53093.745 11559.247105 2.196257e+05 -49.0 -315.421053 -5993.0 19.0
100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 ... NaN NaN NaN NaN NaN NaN NaN -18.0 -43.785713 28.0 0.0 0.0 0.0 0.0 0.928571 0.0 0.0 0.0 0.071429 0.0 0.0 0.0 0.0 0.0 28.0 2.0 0.0 0.0 0.0 14.0 7.160000 179.0 1.0 1.0 3.633997e-16 0.015000 -0.000297 -0.007422 1.157042e-05 560835.375000 64754.585938 1.618865e+06 6662.970 560835.360 64754.586000 1.618865e+06 -544.0 -1385.320000 -34633.0 25.0

2 rows × 667 columns

# 10.0 Read Credit card data
cc_agg = pd.read_csv(
                   'processed_creditCard_agg.csv.zip',
                   nrows = num_rows
                   )

# 10.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:

cc_agg = reducing.Reducer().reduce(cc_agg)
reduced df from 112.1922 MB to 65.3796 MB in 0.40 seconds
# 10.1 Set Index
cc_agg.shape    # (103558, 142)
cc_agg.head(2)
cc_agg = cc_agg.set_index("SK_ID_CURR")
cc_agg.head(2)
cc_agg.shape   # (103558, 141)
(103558, 142)
SK_ID_CURR CC_MONTHS_BALANCE_MIN CC_MONTHS_BALANCE_MAX CC_MONTHS_BALANCE_MEAN CC_MONTHS_BALANCE_SUM CC_MONTHS_BALANCE_VAR CC_AMT_BALANCE_MIN CC_AMT_BALANCE_MAX CC_AMT_BALANCE_MEAN CC_AMT_BALANCE_SUM CC_AMT_BALANCE_VAR CC_AMT_CREDIT_LIMIT_ACTUAL_MIN CC_AMT_CREDIT_LIMIT_ACTUAL_MAX CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN CC_AMT_CREDIT_LIMIT_ACTUAL_SUM CC_AMT_CREDIT_LIMIT_ACTUAL_VAR CC_AMT_DRAWINGS_ATM_CURRENT_MIN CC_AMT_DRAWINGS_ATM_CURRENT_MAX CC_AMT_DRAWINGS_ATM_CURRENT_MEAN CC_AMT_DRAWINGS_ATM_CURRENT_SUM CC_AMT_DRAWINGS_ATM_CURRENT_VAR CC_AMT_DRAWINGS_CURRENT_MIN CC_AMT_DRAWINGS_CURRENT_MAX CC_AMT_DRAWINGS_CURRENT_MEAN CC_AMT_DRAWINGS_CURRENT_SUM CC_AMT_DRAWINGS_CURRENT_VAR CC_AMT_DRAWINGS_OTHER_CURRENT_MIN CC_AMT_DRAWINGS_OTHER_CURRENT_MAX CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN CC_AMT_DRAWINGS_OTHER_CURRENT_SUM CC_AMT_DRAWINGS_OTHER_CURRENT_VAR CC_AMT_DRAWINGS_POS_CURRENT_MIN CC_AMT_DRAWINGS_POS_CURRENT_MAX CC_AMT_DRAWINGS_POS_CURRENT_MEAN CC_AMT_DRAWINGS_POS_CURRENT_SUM CC_AMT_DRAWINGS_POS_CURRENT_VAR CC_AMT_INST_MIN_REGULARITY_MIN CC_AMT_INST_MIN_REGULARITY_MAX CC_AMT_INST_MIN_REGULARITY_MEAN CC_AMT_INST_MIN_REGULARITY_SUM CC_AMT_INST_MIN_REGULARITY_VAR CC_AMT_PAYMENT_CURRENT_MIN CC_AMT_PAYMENT_CURRENT_MAX CC_AMT_PAYMENT_CURRENT_MEAN CC_AMT_PAYMENT_CURRENT_SUM CC_AMT_PAYMENT_CURRENT_VAR CC_AMT_PAYMENT_TOTAL_CURRENT_MIN CC_AMT_PAYMENT_TOTAL_CURRENT_MAX CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN CC_AMT_PAYMENT_TOTAL_CURRENT_SUM ... CC_SK_DPD_MAX CC_SK_DPD_MEAN CC_SK_DPD_SUM CC_SK_DPD_VAR CC_SK_DPD_DEF_MIN CC_SK_DPD_DEF_MAX CC_SK_DPD_DEF_MEAN CC_SK_DPD_DEF_SUM CC_SK_DPD_DEF_VAR CC_NAME_CONTRACT_STATUS_Active_MIN CC_NAME_CONTRACT_STATUS_Active_MAX CC_NAME_CONTRACT_STATUS_Active_MEAN CC_NAME_CONTRACT_STATUS_Active_SUM CC_NAME_CONTRACT_STATUS_Active_VAR CC_NAME_CONTRACT_STATUS_Approved_MIN CC_NAME_CONTRACT_STATUS_Approved_MAX CC_NAME_CONTRACT_STATUS_Approved_MEAN CC_NAME_CONTRACT_STATUS_Approved_SUM CC_NAME_CONTRACT_STATUS_Approved_VAR CC_NAME_CONTRACT_STATUS_Completed_MIN CC_NAME_CONTRACT_STATUS_Completed_MAX CC_NAME_CONTRACT_STATUS_Completed_MEAN CC_NAME_CONTRACT_STATUS_Completed_SUM CC_NAME_CONTRACT_STATUS_Completed_VAR CC_NAME_CONTRACT_STATUS_Demand_MIN CC_NAME_CONTRACT_STATUS_Demand_MAX CC_NAME_CONTRACT_STATUS_Demand_MEAN CC_NAME_CONTRACT_STATUS_Demand_SUM CC_NAME_CONTRACT_STATUS_Demand_VAR CC_NAME_CONTRACT_STATUS_Refused_MIN CC_NAME_CONTRACT_STATUS_Refused_MAX CC_NAME_CONTRACT_STATUS_Refused_MEAN CC_NAME_CONTRACT_STATUS_Refused_SUM CC_NAME_CONTRACT_STATUS_Refused_VAR CC_NAME_CONTRACT_STATUS_Sent proposal_MIN CC_NAME_CONTRACT_STATUS_Sent proposal_MAX CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN CC_NAME_CONTRACT_STATUS_Sent proposal_SUM CC_NAME_CONTRACT_STATUS_Sent proposal_VAR CC_NAME_CONTRACT_STATUS_Signed_MIN CC_NAME_CONTRACT_STATUS_Signed_MAX CC_NAME_CONTRACT_STATUS_Signed_MEAN CC_NAME_CONTRACT_STATUS_Signed_SUM CC_NAME_CONTRACT_STATUS_Signed_VAR CC_NAME_CONTRACT_STATUS_nan_MIN CC_NAME_CONTRACT_STATUS_nan_MAX CC_NAME_CONTRACT_STATUS_nan_MEAN CC_NAME_CONTRACT_STATUS_nan_SUM CC_NAME_CONTRACT_STATUS_nan_VAR CC_COUNT
0 100006 -6 -1 -3.5 -21 3.5 0.0 0.0 0.000000 0.00 0.000000e+00 270000 270000 270000.0000 1620000 0.000000e+00 NaN NaN NaN 0.0 NaN 0.0 0.0 0.000000 0.0 0.000000e+00 NaN NaN NaN 0.0 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.000000 0.0000 0.000000e+00 NaN NaN NaN 0.00 NaN 0.0 0.0 0.000000 0.0 ... 0 0.0 0 0.0 0 0 0.0 0 0.0 1 1 1.0 6 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0 0 0.0 6
1 100011 -75 -2 -38.5 -2849 462.5 0.0 189000.0 54482.109375 4031676.25 4.641321e+09 90000 180000 164189.1875 12150000 1.189060e+09 0.0 180000.0 2432.432432 180000.0 4.378378e+08 0.0 180000.0 2432.432373 180000.0 4.378378e+08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9000.0 3956.221924 288804.1875 2.013991e+07 0.0 55485.0 4843.064189 358386.75 5.299260e+07 0.0 55485.0 4520.067383 334485.0 ... 0 0.0 0 0.0 0 0 0.0 0 0.0 1 1 1.0 74 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0 0 0.0 74

2 rows × 142 columns

CC_MONTHS_BALANCE_MIN CC_MONTHS_BALANCE_MAX CC_MONTHS_BALANCE_MEAN CC_MONTHS_BALANCE_SUM CC_MONTHS_BALANCE_VAR CC_AMT_BALANCE_MIN CC_AMT_BALANCE_MAX CC_AMT_BALANCE_MEAN CC_AMT_BALANCE_SUM CC_AMT_BALANCE_VAR CC_AMT_CREDIT_LIMIT_ACTUAL_MIN CC_AMT_CREDIT_LIMIT_ACTUAL_MAX CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN CC_AMT_CREDIT_LIMIT_ACTUAL_SUM CC_AMT_CREDIT_LIMIT_ACTUAL_VAR CC_AMT_DRAWINGS_ATM_CURRENT_MIN CC_AMT_DRAWINGS_ATM_CURRENT_MAX CC_AMT_DRAWINGS_ATM_CURRENT_MEAN CC_AMT_DRAWINGS_ATM_CURRENT_SUM CC_AMT_DRAWINGS_ATM_CURRENT_VAR CC_AMT_DRAWINGS_CURRENT_MIN CC_AMT_DRAWINGS_CURRENT_MAX CC_AMT_DRAWINGS_CURRENT_MEAN CC_AMT_DRAWINGS_CURRENT_SUM CC_AMT_DRAWINGS_CURRENT_VAR CC_AMT_DRAWINGS_OTHER_CURRENT_MIN CC_AMT_DRAWINGS_OTHER_CURRENT_MAX CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN CC_AMT_DRAWINGS_OTHER_CURRENT_SUM CC_AMT_DRAWINGS_OTHER_CURRENT_VAR CC_AMT_DRAWINGS_POS_CURRENT_MIN CC_AMT_DRAWINGS_POS_CURRENT_MAX CC_AMT_DRAWINGS_POS_CURRENT_MEAN CC_AMT_DRAWINGS_POS_CURRENT_SUM CC_AMT_DRAWINGS_POS_CURRENT_VAR CC_AMT_INST_MIN_REGULARITY_MIN CC_AMT_INST_MIN_REGULARITY_MAX CC_AMT_INST_MIN_REGULARITY_MEAN CC_AMT_INST_MIN_REGULARITY_SUM CC_AMT_INST_MIN_REGULARITY_VAR CC_AMT_PAYMENT_CURRENT_MIN CC_AMT_PAYMENT_CURRENT_MAX CC_AMT_PAYMENT_CURRENT_MEAN CC_AMT_PAYMENT_CURRENT_SUM CC_AMT_PAYMENT_CURRENT_VAR CC_AMT_PAYMENT_TOTAL_CURRENT_MIN CC_AMT_PAYMENT_TOTAL_CURRENT_MAX CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN CC_AMT_PAYMENT_TOTAL_CURRENT_SUM CC_AMT_PAYMENT_TOTAL_CURRENT_VAR ... CC_SK_DPD_MAX CC_SK_DPD_MEAN CC_SK_DPD_SUM CC_SK_DPD_VAR CC_SK_DPD_DEF_MIN CC_SK_DPD_DEF_MAX CC_SK_DPD_DEF_MEAN CC_SK_DPD_DEF_SUM CC_SK_DPD_DEF_VAR CC_NAME_CONTRACT_STATUS_Active_MIN CC_NAME_CONTRACT_STATUS_Active_MAX CC_NAME_CONTRACT_STATUS_Active_MEAN CC_NAME_CONTRACT_STATUS_Active_SUM CC_NAME_CONTRACT_STATUS_Active_VAR CC_NAME_CONTRACT_STATUS_Approved_MIN CC_NAME_CONTRACT_STATUS_Approved_MAX CC_NAME_CONTRACT_STATUS_Approved_MEAN CC_NAME_CONTRACT_STATUS_Approved_SUM CC_NAME_CONTRACT_STATUS_Approved_VAR CC_NAME_CONTRACT_STATUS_Completed_MIN CC_NAME_CONTRACT_STATUS_Completed_MAX CC_NAME_CONTRACT_STATUS_Completed_MEAN CC_NAME_CONTRACT_STATUS_Completed_SUM CC_NAME_CONTRACT_STATUS_Completed_VAR CC_NAME_CONTRACT_STATUS_Demand_MIN CC_NAME_CONTRACT_STATUS_Demand_MAX CC_NAME_CONTRACT_STATUS_Demand_MEAN CC_NAME_CONTRACT_STATUS_Demand_SUM CC_NAME_CONTRACT_STATUS_Demand_VAR CC_NAME_CONTRACT_STATUS_Refused_MIN CC_NAME_CONTRACT_STATUS_Refused_MAX CC_NAME_CONTRACT_STATUS_Refused_MEAN CC_NAME_CONTRACT_STATUS_Refused_SUM CC_NAME_CONTRACT_STATUS_Refused_VAR CC_NAME_CONTRACT_STATUS_Sent proposal_MIN CC_NAME_CONTRACT_STATUS_Sent proposal_MAX CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN CC_NAME_CONTRACT_STATUS_Sent proposal_SUM CC_NAME_CONTRACT_STATUS_Sent proposal_VAR CC_NAME_CONTRACT_STATUS_Signed_MIN CC_NAME_CONTRACT_STATUS_Signed_MAX CC_NAME_CONTRACT_STATUS_Signed_MEAN CC_NAME_CONTRACT_STATUS_Signed_SUM CC_NAME_CONTRACT_STATUS_Signed_VAR CC_NAME_CONTRACT_STATUS_nan_MIN CC_NAME_CONTRACT_STATUS_nan_MAX CC_NAME_CONTRACT_STATUS_nan_MEAN CC_NAME_CONTRACT_STATUS_nan_SUM CC_NAME_CONTRACT_STATUS_nan_VAR CC_COUNT
SK_ID_CURR
100006 -6 -1 -3.5 -21 3.5 0.0 0.0 0.000000 0.00 0.000000e+00 270000 270000 270000.0000 1620000 0.000000e+00 NaN NaN NaN 0.0 NaN 0.0 0.0 0.000000 0.0 0.000000e+00 NaN NaN NaN 0.0 NaN NaN NaN NaN 0.0 NaN 0.0 0.0 0.000000 0.0000 0.000000e+00 NaN NaN NaN 0.00 NaN 0.0 0.0 0.000000 0.0 0.000000e+00 ... 0 0.0 0 0.0 0 0 0.0 0 0.0 1 1 1.0 6 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0 0 0.0 6
100011 -75 -2 -38.5 -2849 462.5 0.0 189000.0 54482.109375 4031676.25 4.641321e+09 90000 180000 164189.1875 12150000 1.189060e+09 0.0 180000.0 2432.432432 180000.0 4.378378e+08 0.0 180000.0 2432.432373 180000.0 4.378378e+08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9000.0 3956.221924 288804.1875 2.013991e+07 0.0 55485.0 4843.064189 358386.75 5.299260e+07 0.0 55485.0 4520.067383 334485.0 5.585877e+07 ... 0 0.0 0 0.0 0 0 0.0 0 0.0 1 1 1.0 74 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0.0 0 0.0 0 0 0 0 0.0 74

2 rows × 141 columns

(103558, 141)
# 11. Join Credit card data with df
df = df.join(cc_agg, how='left', on='SK_ID_CURR')
df.shape    # (356251, 808)
df.head(2)
(356251, 808)
TARGET CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE ... CC_SK_DPD_MAX CC_SK_DPD_MEAN CC_SK_DPD_SUM CC_SK_DPD_VAR CC_SK_DPD_DEF_MIN CC_SK_DPD_DEF_MAX CC_SK_DPD_DEF_MEAN CC_SK_DPD_DEF_SUM CC_SK_DPD_DEF_VAR CC_NAME_CONTRACT_STATUS_Active_MIN CC_NAME_CONTRACT_STATUS_Active_MAX CC_NAME_CONTRACT_STATUS_Active_MEAN CC_NAME_CONTRACT_STATUS_Active_SUM CC_NAME_CONTRACT_STATUS_Active_VAR CC_NAME_CONTRACT_STATUS_Approved_MIN CC_NAME_CONTRACT_STATUS_Approved_MAX CC_NAME_CONTRACT_STATUS_Approved_MEAN CC_NAME_CONTRACT_STATUS_Approved_SUM CC_NAME_CONTRACT_STATUS_Approved_VAR CC_NAME_CONTRACT_STATUS_Completed_MIN CC_NAME_CONTRACT_STATUS_Completed_MAX CC_NAME_CONTRACT_STATUS_Completed_MEAN CC_NAME_CONTRACT_STATUS_Completed_SUM CC_NAME_CONTRACT_STATUS_Completed_VAR CC_NAME_CONTRACT_STATUS_Demand_MIN CC_NAME_CONTRACT_STATUS_Demand_MAX CC_NAME_CONTRACT_STATUS_Demand_MEAN CC_NAME_CONTRACT_STATUS_Demand_SUM CC_NAME_CONTRACT_STATUS_Demand_VAR CC_NAME_CONTRACT_STATUS_Refused_MIN CC_NAME_CONTRACT_STATUS_Refused_MAX CC_NAME_CONTRACT_STATUS_Refused_MEAN CC_NAME_CONTRACT_STATUS_Refused_SUM CC_NAME_CONTRACT_STATUS_Refused_VAR CC_NAME_CONTRACT_STATUS_Sent proposal_MIN CC_NAME_CONTRACT_STATUS_Sent proposal_MAX CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN CC_NAME_CONTRACT_STATUS_Sent proposal_SUM CC_NAME_CONTRACT_STATUS_Sent proposal_VAR CC_NAME_CONTRACT_STATUS_Signed_MIN CC_NAME_CONTRACT_STATUS_Signed_MAX CC_NAME_CONTRACT_STATUS_Signed_MEAN CC_NAME_CONTRACT_STATUS_Signed_SUM CC_NAME_CONTRACT_STATUS_Signed_VAR CC_NAME_CONTRACT_STATUS_nan_MIN CC_NAME_CONTRACT_STATUS_nan_MAX CC_NAME_CONTRACT_STATUS_nan_MEAN CC_NAME_CONTRACT_STATUS_nan_SUM CC_NAME_CONTRACT_STATUS_nan_VAR CC_COUNT
SK_ID_CURR
100002 1.0 0 0 0 0 202500.0 406597.5 24700.5 351000.0 0.018801 -9461 -637.0 -3648.0 -2120 NaN 1 1 0 1 1 0 1.0 2 2 10 0 0 0 0 0 0 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
100003 0.0 1 0 1 0 270000.0 1293502.5 35698.5 1129500.0 0.003541 -16765 -1188.0 -1186.0 -291 NaN 1 1 0 1 1 0 2.0 1 1 11 0 0 0 0 0 0 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 808 columns

# 11.1 Save the results for subsequent use:
df.to_csv("processed_df_joined.csv.zip", compression = "zip")   
##################