Joining all processed data¶
This notebook joins all processed data and then saves it in a file for subsequent modeling.
# Last amended: 24th October, 2020
# Myfolder: C:\Users\Administrator\OneDrive\Documents\home_credit_default_risk
# Objective:
# Solving Kaggle problem: Home Credit Default Risk
# Joining all processed datasets
#
# Data Source: https://www.kaggle.com/c/home-credit-default-risk/data
# Ref: https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features
# 1.0 Libraries
# (Some of these may not be needed here.)
%reset -f
import numpy as np
import pandas as pd
import gc
# 1.1 Reduce read data size
# There is a file reducing.py
# in this folder. A class
# in it is used to reduce
# dataframe size
# (Code modified by me to
# exclude 'category' dtype)
import reducing
# 1.2 Misc
import warnings
import os
warnings.simplefilter(action='ignore', category=FutureWarning)
# 1.3
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# 1.4 Display multiple commands outputs from a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# 2.0 Prepare to read data
pathToData = "C:\\Users\\Administrator\\OneDrive\\Documents\\home_credit_default_risk"
os.chdir(pathToData)
# 2.1 Some constants
num_rows=None # Implies read all rows
nan_as_category = True # While transforming
# 'object' columns to dummies
# 3.0 Read previous application data first
df = pd.read_csv(
'processed_df.csv.zip',
nrows = num_rows
)
# 3.0.1 Reduce memory usage by appropriately
# changing data-types per feature:
df = reducing.Reducer().reduce(df)
reduced df from 712.1107 MB to 257.1889 MB in 8.61 seconds
# 3.1
df.shape # (356251, 262)
df.head(2)
(356251, 262)
Unnamed: 0 | index | SK_ID_CURR | TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | ... | ORGANIZATION_TYPE_Police | ORGANIZATION_TYPE_Postal | ORGANIZATION_TYPE_Realtor | ORGANIZATION_TYPE_Religion | ORGANIZATION_TYPE_Restaurant | ORGANIZATION_TYPE_School | ORGANIZATION_TYPE_Security | ORGANIZATION_TYPE_Security Ministries | ORGANIZATION_TYPE_Self-employed | ORGANIZATION_TYPE_Services | ORGANIZATION_TYPE_Telecom | ORGANIZATION_TYPE_Trade: type 1 | ORGANIZATION_TYPE_Trade: type 2 | ORGANIZATION_TYPE_Trade: type 3 | ORGANIZATION_TYPE_Trade: type 4 | ORGANIZATION_TYPE_Trade: type 5 | ORGANIZATION_TYPE_Trade: type 6 | ORGANIZATION_TYPE_Trade: type 7 | ORGANIZATION_TYPE_Transport: type 1 | ORGANIZATION_TYPE_Transport: type 2 | ORGANIZATION_TYPE_Transport: type 3 | ORGANIZATION_TYPE_Transport: type 4 | ORGANIZATION_TYPE_University | ORGANIZATION_TYPE_XNA | ORGANIZATION_TYPE_nan | FONDKAPREMONT_MODE_not specified | FONDKAPREMONT_MODE_org spec account | FONDKAPREMONT_MODE_reg oper account | FONDKAPREMONT_MODE_reg oper spec account | FONDKAPREMONT_MODE_nan | HOUSETYPE_MODE_block of flats | HOUSETYPE_MODE_specific housing | HOUSETYPE_MODE_terraced house | HOUSETYPE_MODE_nan | WALLSMATERIAL_MODE_Block | WALLSMATERIAL_MODE_Mixed | WALLSMATERIAL_MODE_Monolithic | WALLSMATERIAL_MODE_Others | WALLSMATERIAL_MODE_Panel | WALLSMATERIAL_MODE_Stone, brick | WALLSMATERIAL_MODE_Wooden | WALLSMATERIAL_MODE_nan | EMERGENCYSTATE_MODE_No | EMERGENCYSTATE_MODE_Yes | EMERGENCYSTATE_MODE_nan | DAYS_EMPLOYED_PERC | INCOME_CREDIT_PERC | INCOME_PER_PERSON | ANNUITY_INCOME_PERC | PAYMENT_RATE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.067329 | 0.498036 | 202500.0 | 0.121978 | 0.060749 |
1 | 1 | 1 | 100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.070862 | 0.208736 | 135000.0 | 0.132217 | 0.027598 |
2 rows × 262 columns
# 3.2
df.columns
df.drop(columns = ['Unnamed: 0', 'index'], inplace = True)
df.columns
Index(['Unnamed: 0', 'index', 'SK_ID_CURR', 'TARGET', 'CODE_GENDER',
'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
'AMT_CREDIT',
...
'WALLSMATERIAL_MODE_Wooden', 'WALLSMATERIAL_MODE_nan',
'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes',
'EMERGENCYSTATE_MODE_nan', 'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC',
'INCOME_PER_PERSON', 'ANNUITY_INCOME_PERC', 'PAYMENT_RATE'],
dtype='object', length=262)
Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR',
'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
'AMT_ANNUITY', 'AMT_GOODS_PRICE',
...
'WALLSMATERIAL_MODE_Wooden', 'WALLSMATERIAL_MODE_nan',
'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes',
'EMERGENCYSTATE_MODE_nan', 'DAYS_EMPLOYED_PERC', 'INCOME_CREDIT_PERC',
'INCOME_PER_PERSON', 'ANNUITY_INCOME_PERC', 'PAYMENT_RATE'],
dtype='object', length=260)
# 3.3
df.head(2)
SK_ID_CURR | TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | ... | ORGANIZATION_TYPE_Police | ORGANIZATION_TYPE_Postal | ORGANIZATION_TYPE_Realtor | ORGANIZATION_TYPE_Religion | ORGANIZATION_TYPE_Restaurant | ORGANIZATION_TYPE_School | ORGANIZATION_TYPE_Security | ORGANIZATION_TYPE_Security Ministries | ORGANIZATION_TYPE_Self-employed | ORGANIZATION_TYPE_Services | ORGANIZATION_TYPE_Telecom | ORGANIZATION_TYPE_Trade: type 1 | ORGANIZATION_TYPE_Trade: type 2 | ORGANIZATION_TYPE_Trade: type 3 | ORGANIZATION_TYPE_Trade: type 4 | ORGANIZATION_TYPE_Trade: type 5 | ORGANIZATION_TYPE_Trade: type 6 | ORGANIZATION_TYPE_Trade: type 7 | ORGANIZATION_TYPE_Transport: type 1 | ORGANIZATION_TYPE_Transport: type 2 | ORGANIZATION_TYPE_Transport: type 3 | ORGANIZATION_TYPE_Transport: type 4 | ORGANIZATION_TYPE_University | ORGANIZATION_TYPE_XNA | ORGANIZATION_TYPE_nan | FONDKAPREMONT_MODE_not specified | FONDKAPREMONT_MODE_org spec account | FONDKAPREMONT_MODE_reg oper account | FONDKAPREMONT_MODE_reg oper spec account | FONDKAPREMONT_MODE_nan | HOUSETYPE_MODE_block of flats | HOUSETYPE_MODE_specific housing | HOUSETYPE_MODE_terraced house | HOUSETYPE_MODE_nan | WALLSMATERIAL_MODE_Block | WALLSMATERIAL_MODE_Mixed | WALLSMATERIAL_MODE_Monolithic | WALLSMATERIAL_MODE_Others | WALLSMATERIAL_MODE_Panel | WALLSMATERIAL_MODE_Stone, brick | WALLSMATERIAL_MODE_Wooden | WALLSMATERIAL_MODE_nan | EMERGENCYSTATE_MODE_No | EMERGENCYSTATE_MODE_Yes | EMERGENCYSTATE_MODE_nan | DAYS_EMPLOYED_PERC | INCOME_CREDIT_PERC | INCOME_PER_PERSON | ANNUITY_INCOME_PERC | PAYMENT_RATE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.067329 | 0.498036 | 202500.0 | 0.121978 | 0.060749 |
1 | 100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.070862 | 0.208736 | 135000.0 | 0.132217 | 0.027598 |
2 rows × 260 columns
# 3.4 Set SK_ID_CURR as Index
df = df.set_index('SK_ID_CURR')
df.head(2)
df.shape # (356251, 259)
TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | ... | ORGANIZATION_TYPE_Police | ORGANIZATION_TYPE_Postal | ORGANIZATION_TYPE_Realtor | ORGANIZATION_TYPE_Religion | ORGANIZATION_TYPE_Restaurant | ORGANIZATION_TYPE_School | ORGANIZATION_TYPE_Security | ORGANIZATION_TYPE_Security Ministries | ORGANIZATION_TYPE_Self-employed | ORGANIZATION_TYPE_Services | ORGANIZATION_TYPE_Telecom | ORGANIZATION_TYPE_Trade: type 1 | ORGANIZATION_TYPE_Trade: type 2 | ORGANIZATION_TYPE_Trade: type 3 | ORGANIZATION_TYPE_Trade: type 4 | ORGANIZATION_TYPE_Trade: type 5 | ORGANIZATION_TYPE_Trade: type 6 | ORGANIZATION_TYPE_Trade: type 7 | ORGANIZATION_TYPE_Transport: type 1 | ORGANIZATION_TYPE_Transport: type 2 | ORGANIZATION_TYPE_Transport: type 3 | ORGANIZATION_TYPE_Transport: type 4 | ORGANIZATION_TYPE_University | ORGANIZATION_TYPE_XNA | ORGANIZATION_TYPE_nan | FONDKAPREMONT_MODE_not specified | FONDKAPREMONT_MODE_org spec account | FONDKAPREMONT_MODE_reg oper account | FONDKAPREMONT_MODE_reg oper spec account | FONDKAPREMONT_MODE_nan | HOUSETYPE_MODE_block of flats | HOUSETYPE_MODE_specific housing | HOUSETYPE_MODE_terraced house | HOUSETYPE_MODE_nan | WALLSMATERIAL_MODE_Block | WALLSMATERIAL_MODE_Mixed | WALLSMATERIAL_MODE_Monolithic | WALLSMATERIAL_MODE_Others | WALLSMATERIAL_MODE_Panel | WALLSMATERIAL_MODE_Stone, brick | WALLSMATERIAL_MODE_Wooden | WALLSMATERIAL_MODE_nan | EMERGENCYSTATE_MODE_No | EMERGENCYSTATE_MODE_Yes | EMERGENCYSTATE_MODE_nan | DAYS_EMPLOYED_PERC | INCOME_CREDIT_PERC | INCOME_PER_PERSON | ANNUITY_INCOME_PERC | PAYMENT_RATE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0.067329 | 0.498036 | 202500.0 | 0.121978 | 0.060749 |
100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.070862 | 0.208736 | 135000.0 | 0.132217 | 0.027598 |
2 rows × 259 columns
(356251, 259)
# 4.0 Read bureau_agg
bureau_agg = pd.read_csv(
'processed_bureau_agg.csv.zip',
nrows = num_rows
)
# 4.0.1 Reduce memory usage by appropriately
# changing data-types per feature:
bureau_agg = reducing.Reducer().reduce(bureau_agg)
reduced df from 272.9790 MB to 222.2329 MB in 1.10 seconds
# 4.1 Set index
bureau_agg.head(2)
bureau_agg = bureau_agg.set_index("SK_ID_CURR")
bureau_agg.head(2)
bureau_agg.shape # (305811, 116)
SK_ID_CURR | BURO_DAYS_CREDIT_MIN | BURO_DAYS_CREDIT_MAX | BURO_DAYS_CREDIT_MEAN | BURO_DAYS_CREDIT_VAR | BURO_DAYS_CREDIT_ENDDATE_MIN | BURO_DAYS_CREDIT_ENDDATE_MAX | BURO_DAYS_CREDIT_ENDDATE_MEAN | BURO_DAYS_CREDIT_UPDATE_MEAN | BURO_CREDIT_DAY_OVERDUE_MAX | BURO_CREDIT_DAY_OVERDUE_MEAN | BURO_AMT_CREDIT_MAX_OVERDUE_MEAN | BURO_AMT_CREDIT_SUM_MAX | BURO_AMT_CREDIT_SUM_MEAN | BURO_AMT_CREDIT_SUM_SUM | BURO_AMT_CREDIT_SUM_DEBT_MAX | BURO_AMT_CREDIT_SUM_DEBT_MEAN | BURO_AMT_CREDIT_SUM_DEBT_SUM | BURO_AMT_CREDIT_SUM_OVERDUE_MEAN | BURO_AMT_CREDIT_SUM_LIMIT_MEAN | BURO_AMT_CREDIT_SUM_LIMIT_SUM | BURO_AMT_ANNUITY_MAX | BURO_AMT_ANNUITY_MEAN | BURO_CNT_CREDIT_PROLONG_SUM | BURO_MONTHS_BALANCE_MIN_MIN | BURO_MONTHS_BALANCE_MAX_MAX | BURO_MONTHS_BALANCE_SIZE_MEAN | BURO_MONTHS_BALANCE_SIZE_SUM | BURO_CREDIT_ACTIVE_Active_MEAN | BURO_CREDIT_ACTIVE_Bad debt_MEAN | BURO_CREDIT_ACTIVE_Closed_MEAN | BURO_CREDIT_ACTIVE_Sold_MEAN | BURO_CREDIT_ACTIVE_nan_MEAN | BURO_CREDIT_CURRENCY_currency 1_MEAN | BURO_CREDIT_CURRENCY_currency 2_MEAN | BURO_CREDIT_CURRENCY_currency 3_MEAN | BURO_CREDIT_CURRENCY_currency 4_MEAN | BURO_CREDIT_CURRENCY_nan_MEAN | BURO_CREDIT_TYPE_Another type of loan_MEAN | BURO_CREDIT_TYPE_Car loan_MEAN | BURO_CREDIT_TYPE_Cash loan (non-earmarked)_MEAN | BURO_CREDIT_TYPE_Consumer credit_MEAN | BURO_CREDIT_TYPE_Credit card_MEAN | BURO_CREDIT_TYPE_Interbank credit_MEAN | BURO_CREDIT_TYPE_Loan for business development_MEAN | BURO_CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN | BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN | BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN | BURO_CREDIT_TYPE_Microloan_MEAN | BURO_CREDIT_TYPE_Mobile operator loan_MEAN | ... | ACTIVE_DAYS_CREDIT_ENDDATE_MIN | ACTIVE_DAYS_CREDIT_ENDDATE_MAX | ACTIVE_DAYS_CREDIT_ENDDATE_MEAN | ACTIVE_DAYS_CREDIT_UPDATE_MEAN | ACTIVE_CREDIT_DAY_OVERDUE_MAX | ACTIVE_CREDIT_DAY_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_SUM_MAX | ACTIVE_AMT_CREDIT_SUM_MEAN | ACTIVE_AMT_CREDIT_SUM_SUM | ACTIVE_AMT_CREDIT_SUM_DEBT_MAX | ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN | ACTIVE_AMT_CREDIT_SUM_DEBT_SUM | ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN | ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM | ACTIVE_AMT_ANNUITY_MAX | ACTIVE_AMT_ANNUITY_MEAN | ACTIVE_CNT_CREDIT_PROLONG_SUM | ACTIVE_MONTHS_BALANCE_MIN_MIN | ACTIVE_MONTHS_BALANCE_MAX_MAX | ACTIVE_MONTHS_BALANCE_SIZE_MEAN | ACTIVE_MONTHS_BALANCE_SIZE_SUM | CLOSED_DAYS_CREDIT_MIN | CLOSED_DAYS_CREDIT_MAX | CLOSED_DAYS_CREDIT_MEAN | CLOSED_DAYS_CREDIT_VAR | CLOSED_DAYS_CREDIT_ENDDATE_MIN | CLOSED_DAYS_CREDIT_ENDDATE_MAX | CLOSED_DAYS_CREDIT_ENDDATE_MEAN | CLOSED_DAYS_CREDIT_UPDATE_MEAN | CLOSED_CREDIT_DAY_OVERDUE_MAX | CLOSED_CREDIT_DAY_OVERDUE_MEAN | CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN | CLOSED_AMT_CREDIT_SUM_MAX | CLOSED_AMT_CREDIT_SUM_MEAN | CLOSED_AMT_CREDIT_SUM_SUM | CLOSED_AMT_CREDIT_SUM_DEBT_MAX | CLOSED_AMT_CREDIT_SUM_DEBT_MEAN | CLOSED_AMT_CREDIT_SUM_DEBT_SUM | CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN | CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN | CLOSED_AMT_CREDIT_SUM_LIMIT_SUM | CLOSED_AMT_ANNUITY_MAX | CLOSED_AMT_ANNUITY_MEAN | CLOSED_CNT_CREDIT_PROLONG_SUM | CLOSED_MONTHS_BALANCE_MIN_MIN | CLOSED_MONTHS_BALANCE_MAX_MAX | CLOSED_MONTHS_BALANCE_SIZE_MEAN | CLOSED_MONTHS_BALANCE_SIZE_SUM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | -1572 | -49 | -735.0 | 240043.666667 | -1329.0 | 1778.0 | 82.428571 | -93.14286 | 0 | 0.0 | NaN | 378000.0 | 207623.571429 | 1.453365e+06 | 373239.0 | 85240.928571 | 596686.5 | 0.0 | 0.00000 | 0.000000 | 10822.5 | 3545.357143 | 0 | -51.0 | 0.0 | 24.571429 | 172.0 | 0.428571 | 0.0 | 0.571429 | 0.0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 411.0 | 1778.0 | 1030.333333 | -10.666667 | 0.0 | 0.0 | NaN | 378000.0 | 294675.0000 | 884025.000 | 373239.0 | 198895.5 | 596686.5 | 0.0 | 0.0000 | 0.000 | 10822.5 | 8272.5 | 0.0 | -18.0 | 0.0 | 10.666667 | 32.0 | -1572.0 | -857.0 | -1054.25 | 119594.25 | -1329.0 | -179.0 | -628.5 | -155.000000 | 0.0 | 0.0 | NaN | 279720.0 | 142335.0 | 569340.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -51.0 | 0.0 | 35.0 | 140.0 |
1 | 100002 | -1437 | -103 | -874.0 | 186150.000000 | -1072.0 | 780.0 | -349.000000 | -499.87500 | 0 | 0.0 | 1681.029 | 450000.0 | 108131.945625 | 8.650556e+05 | 245781.0 | 49156.200000 | 245781.0 | 0.0 | 7997.14125 | 31988.564453 | 0.0 | 0.000000 | 0 | -47.0 | 0.0 | 13.750000 | 110.0 | 0.250000 | 0.0 | 0.750000 | 0.0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 780.0 | 780.0 | 780.000000 | -15.500000 | 0.0 | 0.0 | 40.5 | 450000.0 | 240994.2825 | 481988.565 | 245781.0 | 122890.5 | 245781.0 | 0.0 | 15994.2825 | 31988.565 | 0.0 | 0.0 | 0.0 | -34.0 | 0.0 | 10.000000 | 20.0 | -1437.0 | -476.0 | -974.50 | 123956.70 | -1072.0 | 85.0 | -574.8 | -661.333333 | 0.0 | 0.0 | 2091.16125 | 135000.0 | 63844.5 | 383067.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -47.0 | 0.0 | 15.0 | 90.0 |
2 rows × 117 columns
BURO_DAYS_CREDIT_MIN | BURO_DAYS_CREDIT_MAX | BURO_DAYS_CREDIT_MEAN | BURO_DAYS_CREDIT_VAR | BURO_DAYS_CREDIT_ENDDATE_MIN | BURO_DAYS_CREDIT_ENDDATE_MAX | BURO_DAYS_CREDIT_ENDDATE_MEAN | BURO_DAYS_CREDIT_UPDATE_MEAN | BURO_CREDIT_DAY_OVERDUE_MAX | BURO_CREDIT_DAY_OVERDUE_MEAN | BURO_AMT_CREDIT_MAX_OVERDUE_MEAN | BURO_AMT_CREDIT_SUM_MAX | BURO_AMT_CREDIT_SUM_MEAN | BURO_AMT_CREDIT_SUM_SUM | BURO_AMT_CREDIT_SUM_DEBT_MAX | BURO_AMT_CREDIT_SUM_DEBT_MEAN | BURO_AMT_CREDIT_SUM_DEBT_SUM | BURO_AMT_CREDIT_SUM_OVERDUE_MEAN | BURO_AMT_CREDIT_SUM_LIMIT_MEAN | BURO_AMT_CREDIT_SUM_LIMIT_SUM | BURO_AMT_ANNUITY_MAX | BURO_AMT_ANNUITY_MEAN | BURO_CNT_CREDIT_PROLONG_SUM | BURO_MONTHS_BALANCE_MIN_MIN | BURO_MONTHS_BALANCE_MAX_MAX | BURO_MONTHS_BALANCE_SIZE_MEAN | BURO_MONTHS_BALANCE_SIZE_SUM | BURO_CREDIT_ACTIVE_Active_MEAN | BURO_CREDIT_ACTIVE_Bad debt_MEAN | BURO_CREDIT_ACTIVE_Closed_MEAN | BURO_CREDIT_ACTIVE_Sold_MEAN | BURO_CREDIT_ACTIVE_nan_MEAN | BURO_CREDIT_CURRENCY_currency 1_MEAN | BURO_CREDIT_CURRENCY_currency 2_MEAN | BURO_CREDIT_CURRENCY_currency 3_MEAN | BURO_CREDIT_CURRENCY_currency 4_MEAN | BURO_CREDIT_CURRENCY_nan_MEAN | BURO_CREDIT_TYPE_Another type of loan_MEAN | BURO_CREDIT_TYPE_Car loan_MEAN | BURO_CREDIT_TYPE_Cash loan (non-earmarked)_MEAN | BURO_CREDIT_TYPE_Consumer credit_MEAN | BURO_CREDIT_TYPE_Credit card_MEAN | BURO_CREDIT_TYPE_Interbank credit_MEAN | BURO_CREDIT_TYPE_Loan for business development_MEAN | BURO_CREDIT_TYPE_Loan for purchase of shares (margin lending)_MEAN | BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN | BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN | BURO_CREDIT_TYPE_Microloan_MEAN | BURO_CREDIT_TYPE_Mobile operator loan_MEAN | BURO_CREDIT_TYPE_Mortgage_MEAN | ... | ACTIVE_DAYS_CREDIT_ENDDATE_MIN | ACTIVE_DAYS_CREDIT_ENDDATE_MAX | ACTIVE_DAYS_CREDIT_ENDDATE_MEAN | ACTIVE_DAYS_CREDIT_UPDATE_MEAN | ACTIVE_CREDIT_DAY_OVERDUE_MAX | ACTIVE_CREDIT_DAY_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_SUM_MAX | ACTIVE_AMT_CREDIT_SUM_MEAN | ACTIVE_AMT_CREDIT_SUM_SUM | ACTIVE_AMT_CREDIT_SUM_DEBT_MAX | ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN | ACTIVE_AMT_CREDIT_SUM_DEBT_SUM | ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN | ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM | ACTIVE_AMT_ANNUITY_MAX | ACTIVE_AMT_ANNUITY_MEAN | ACTIVE_CNT_CREDIT_PROLONG_SUM | ACTIVE_MONTHS_BALANCE_MIN_MIN | ACTIVE_MONTHS_BALANCE_MAX_MAX | ACTIVE_MONTHS_BALANCE_SIZE_MEAN | ACTIVE_MONTHS_BALANCE_SIZE_SUM | CLOSED_DAYS_CREDIT_MIN | CLOSED_DAYS_CREDIT_MAX | CLOSED_DAYS_CREDIT_MEAN | CLOSED_DAYS_CREDIT_VAR | CLOSED_DAYS_CREDIT_ENDDATE_MIN | CLOSED_DAYS_CREDIT_ENDDATE_MAX | CLOSED_DAYS_CREDIT_ENDDATE_MEAN | CLOSED_DAYS_CREDIT_UPDATE_MEAN | CLOSED_CREDIT_DAY_OVERDUE_MAX | CLOSED_CREDIT_DAY_OVERDUE_MEAN | CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN | CLOSED_AMT_CREDIT_SUM_MAX | CLOSED_AMT_CREDIT_SUM_MEAN | CLOSED_AMT_CREDIT_SUM_SUM | CLOSED_AMT_CREDIT_SUM_DEBT_MAX | CLOSED_AMT_CREDIT_SUM_DEBT_MEAN | CLOSED_AMT_CREDIT_SUM_DEBT_SUM | CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN | CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN | CLOSED_AMT_CREDIT_SUM_LIMIT_SUM | CLOSED_AMT_ANNUITY_MAX | CLOSED_AMT_ANNUITY_MEAN | CLOSED_CNT_CREDIT_PROLONG_SUM | CLOSED_MONTHS_BALANCE_MIN_MIN | CLOSED_MONTHS_BALANCE_MAX_MAX | CLOSED_MONTHS_BALANCE_SIZE_MEAN | CLOSED_MONTHS_BALANCE_SIZE_SUM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100001 | -1572 | -49 | -735.0 | 240043.666667 | -1329.0 | 1778.0 | 82.428571 | -93.14286 | 0 | 0.0 | NaN | 378000.0 | 207623.571429 | 1.453365e+06 | 373239.0 | 85240.928571 | 596686.5 | 0.0 | 0.00000 | 0.000000 | 10822.5 | 3545.357143 | 0 | -51.0 | 0.0 | 24.571429 | 172.0 | 0.428571 | 0.0 | 0.571429 | 0.0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 411.0 | 1778.0 | 1030.333333 | -10.666667 | 0.0 | 0.0 | NaN | 378000.0 | 294675.0000 | 884025.000 | 373239.0 | 198895.5 | 596686.5 | 0.0 | 0.0000 | 0.000 | 10822.5 | 8272.5 | 0.0 | -18.0 | 0.0 | 10.666667 | 32.0 | -1572.0 | -857.0 | -1054.25 | 119594.25 | -1329.0 | -179.0 | -628.5 | -155.000000 | 0.0 | 0.0 | NaN | 279720.0 | 142335.0 | 569340.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -51.0 | 0.0 | 35.0 | 140.0 |
100002 | -1437 | -103 | -874.0 | 186150.000000 | -1072.0 | 780.0 | -349.000000 | -499.87500 | 0 | 0.0 | 1681.029 | 450000.0 | 108131.945625 | 8.650556e+05 | 245781.0 | 49156.200000 | 245781.0 | 0.0 | 7997.14125 | 31988.564453 | 0.0 | 0.000000 | 0 | -47.0 | 0.0 | 13.750000 | 110.0 | 0.250000 | 0.0 | 0.750000 | 0.0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | 0.5 | 0.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 780.0 | 780.0 | 780.000000 | -15.500000 | 0.0 | 0.0 | 40.5 | 450000.0 | 240994.2825 | 481988.565 | 245781.0 | 122890.5 | 245781.0 | 0.0 | 15994.2825 | 31988.565 | 0.0 | 0.0 | 0.0 | -34.0 | 0.0 | 10.000000 | 20.0 | -1437.0 | -476.0 | -974.50 | 123956.70 | -1072.0 | 85.0 | -574.8 | -661.333333 | 0.0 | 0.0 | 2091.16125 | 135000.0 | 63844.5 | 383067.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -47.0 | 0.0 | 15.0 | 90.0 |
2 rows × 116 columns
(305811, 116)
# 5.0 Join bureau_agg with df
df = df.join(
bureau_agg,
how='left',
on='SK_ID_CURR'
)
# 5.1
df.shape # (356251, 375)
df.head(2)
(356251, 375)
TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | ... | ACTIVE_DAYS_CREDIT_ENDDATE_MIN | ACTIVE_DAYS_CREDIT_ENDDATE_MAX | ACTIVE_DAYS_CREDIT_ENDDATE_MEAN | ACTIVE_DAYS_CREDIT_UPDATE_MEAN | ACTIVE_CREDIT_DAY_OVERDUE_MAX | ACTIVE_CREDIT_DAY_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_SUM_MAX | ACTIVE_AMT_CREDIT_SUM_MEAN | ACTIVE_AMT_CREDIT_SUM_SUM | ACTIVE_AMT_CREDIT_SUM_DEBT_MAX | ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN | ACTIVE_AMT_CREDIT_SUM_DEBT_SUM | ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN | ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN | ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM | ACTIVE_AMT_ANNUITY_MAX | ACTIVE_AMT_ANNUITY_MEAN | ACTIVE_CNT_CREDIT_PROLONG_SUM | ACTIVE_MONTHS_BALANCE_MIN_MIN | ACTIVE_MONTHS_BALANCE_MAX_MAX | ACTIVE_MONTHS_BALANCE_SIZE_MEAN | ACTIVE_MONTHS_BALANCE_SIZE_SUM | CLOSED_DAYS_CREDIT_MIN | CLOSED_DAYS_CREDIT_MAX | CLOSED_DAYS_CREDIT_MEAN | CLOSED_DAYS_CREDIT_VAR | CLOSED_DAYS_CREDIT_ENDDATE_MIN | CLOSED_DAYS_CREDIT_ENDDATE_MAX | CLOSED_DAYS_CREDIT_ENDDATE_MEAN | CLOSED_DAYS_CREDIT_UPDATE_MEAN | CLOSED_CREDIT_DAY_OVERDUE_MAX | CLOSED_CREDIT_DAY_OVERDUE_MEAN | CLOSED_AMT_CREDIT_MAX_OVERDUE_MEAN | CLOSED_AMT_CREDIT_SUM_MAX | CLOSED_AMT_CREDIT_SUM_MEAN | CLOSED_AMT_CREDIT_SUM_SUM | CLOSED_AMT_CREDIT_SUM_DEBT_MAX | CLOSED_AMT_CREDIT_SUM_DEBT_MEAN | CLOSED_AMT_CREDIT_SUM_DEBT_SUM | CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN | CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN | CLOSED_AMT_CREDIT_SUM_LIMIT_SUM | CLOSED_AMT_ANNUITY_MAX | CLOSED_AMT_ANNUITY_MEAN | CLOSED_CNT_CREDIT_PROLONG_SUM | CLOSED_MONTHS_BALANCE_MIN_MIN | CLOSED_MONTHS_BALANCE_MAX_MAX | CLOSED_MONTHS_BALANCE_SIZE_MEAN | CLOSED_MONTHS_BALANCE_SIZE_SUM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | ... | 780.0 | 780.0 | 780.0 | -15.5 | 0.0 | 0.0 | 40.5 | 450000.0 | 240994.2825 | 481988.565 | 245781.0 | 122890.5 | 245781.0 | 0.0 | 15994.2825 | 31988.565 | 0.0 | 0.0 | 0.0 | -34.0 | 0.0 | 10.0 | 20.0 | -1437.0 | -476.0 | -974.500000 | 123956.700000 | -1072.0 | 85.0 | -574.800000 | -661.333333 | 0.0 | 0.0 | 2091.16125 | 135000.0 | 63844.5 | 383067.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -47.0 | 0.0 | 15.0 | 90.0 |
100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | ... | 1216.0 | 1216.0 | 1216.0 | -43.0 | 0.0 | 0.0 | 0.0 | 810000.0 | 810000.0000 | 810000.000 | 0.0 | 0.0 | 0.0 | 0.0 | 810000.0000 | 810000.000 | NaN | NaN | 0.0 | NaN | NaN | NaN | 0.0 | -2586.0 | -775.0 | -1665.666667 | 820590.333333 | -2434.0 | -420.0 | -1131.333333 | -1073.666667 | 0.0 | 0.0 | 0.00000 | 112500.0 | 69133.5 | 207400.5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | 0.0 | NaN | NaN | NaN | 0.0 |
2 rows × 375 columns
# 5.2 Read previous application data
prev_agg = pd.read_csv(
'processed_prev_agg.csv.zip',
nrows = num_rows
)
# 5.3 Reduce memory usage by appropriately
# changing data-types per feature:
prev_agg = reducing.Reducer().reduce(prev_agg)
reduced df from 646.3186 MB to 405.8881 MB in 2.13 seconds
# 5.3 Set Index
prev_agg.shape # (338857, 250)
prev_agg.head(2)
prev_agg = prev_agg.set_index("SK_ID_CURR")
prev_agg.head(2)
prev_agg.shape # (338857, 250)
(338857, 250)
SK_ID_CURR | PREV_AMT_ANNUITY_MIN | PREV_AMT_ANNUITY_MAX | PREV_AMT_ANNUITY_MEAN | PREV_AMT_APPLICATION_MIN | PREV_AMT_APPLICATION_MAX | PREV_AMT_APPLICATION_MEAN | PREV_AMT_CREDIT_MIN | PREV_AMT_CREDIT_MAX | PREV_AMT_CREDIT_MEAN | PREV_APP_CREDIT_PERC_MIN | PREV_APP_CREDIT_PERC_MAX | PREV_APP_CREDIT_PERC_MEAN | PREV_APP_CREDIT_PERC_VAR | PREV_AMT_DOWN_PAYMENT_MIN | PREV_AMT_DOWN_PAYMENT_MAX | PREV_AMT_DOWN_PAYMENT_MEAN | PREV_AMT_GOODS_PRICE_MIN | PREV_AMT_GOODS_PRICE_MAX | PREV_AMT_GOODS_PRICE_MEAN | PREV_HOUR_APPR_PROCESS_START_MIN | PREV_HOUR_APPR_PROCESS_START_MAX | PREV_HOUR_APPR_PROCESS_START_MEAN | PREV_RATE_DOWN_PAYMENT_MIN | PREV_RATE_DOWN_PAYMENT_MAX | PREV_RATE_DOWN_PAYMENT_MEAN | PREV_DAYS_DECISION_MIN | PREV_DAYS_DECISION_MAX | PREV_DAYS_DECISION_MEAN | PREV_CNT_PAYMENT_MEAN | PREV_CNT_PAYMENT_SUM | PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN | PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN | PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | PREV_NAME_CONTRACT_TYPE_XNA_MEAN | PREV_NAME_CONTRACT_TYPE_nan_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_nan_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_nan_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN | ... | APPROVED_APP_CREDIT_PERC_MAX | APPROVED_APP_CREDIT_PERC_MEAN | APPROVED_APP_CREDIT_PERC_VAR | APPROVED_AMT_DOWN_PAYMENT_MIN | APPROVED_AMT_DOWN_PAYMENT_MAX | APPROVED_AMT_DOWN_PAYMENT_MEAN | APPROVED_AMT_GOODS_PRICE_MIN | APPROVED_AMT_GOODS_PRICE_MAX | APPROVED_AMT_GOODS_PRICE_MEAN | APPROVED_HOUR_APPR_PROCESS_START_MIN | APPROVED_HOUR_APPR_PROCESS_START_MAX | APPROVED_HOUR_APPR_PROCESS_START_MEAN | APPROVED_RATE_DOWN_PAYMENT_MIN | APPROVED_RATE_DOWN_PAYMENT_MAX | APPROVED_RATE_DOWN_PAYMENT_MEAN | APPROVED_DAYS_DECISION_MIN | APPROVED_DAYS_DECISION_MAX | APPROVED_DAYS_DECISION_MEAN | APPROVED_CNT_PAYMENT_MEAN | APPROVED_CNT_PAYMENT_SUM | REFUSED_AMT_ANNUITY_MIN | REFUSED_AMT_ANNUITY_MAX | REFUSED_AMT_ANNUITY_MEAN | REFUSED_AMT_APPLICATION_MIN | REFUSED_AMT_APPLICATION_MAX | REFUSED_AMT_APPLICATION_MEAN | REFUSED_AMT_CREDIT_MIN | REFUSED_AMT_CREDIT_MAX | REFUSED_AMT_CREDIT_MEAN | REFUSED_APP_CREDIT_PERC_MIN | REFUSED_APP_CREDIT_PERC_MAX | REFUSED_APP_CREDIT_PERC_MEAN | REFUSED_APP_CREDIT_PERC_VAR | REFUSED_AMT_DOWN_PAYMENT_MIN | REFUSED_AMT_DOWN_PAYMENT_MAX | REFUSED_AMT_DOWN_PAYMENT_MEAN | REFUSED_AMT_GOODS_PRICE_MIN | REFUSED_AMT_GOODS_PRICE_MAX | REFUSED_AMT_GOODS_PRICE_MEAN | REFUSED_HOUR_APPR_PROCESS_START_MIN | REFUSED_HOUR_APPR_PROCESS_START_MAX | REFUSED_HOUR_APPR_PROCESS_START_MEAN | REFUSED_RATE_DOWN_PAYMENT_MIN | REFUSED_RATE_DOWN_PAYMENT_MAX | REFUSED_RATE_DOWN_PAYMENT_MEAN | REFUSED_DAYS_DECISION_MIN | REFUSED_DAYS_DECISION_MAX | REFUSED_DAYS_DECISION_MEAN | REFUSED_CNT_PAYMENT_MEAN | REFUSED_CNT_PAYMENT_SUM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | 3951.000 | 3951.000 | 3951.000 | 24835.5 | 24835.5 | 24835.5 | 23787.0 | 23787.0 | 23787.0 | 1.044079 | 1.044079 | 1.044079 | NaN | 2520.0 | 2520.0 | 2520.0 | 24835.5 | 24835.5 | 24835.5 | 13 | 13 | 13.0 | 0.104326 | 0.104326 | 0.104326 | -1740 | -1740 | -1740.0 | 8.0 | 8.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 1.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 1.044079 | 1.044079 | NaN | 2520.0 | 2520.0 | 2520.0 | 24835.5 | 24835.5 | 24835.5 | 13.0 | 13.0 | 13.0 | 0.104326 | 0.104326 | 0.104326 | -1740.0 | -1740.0 | -1740.0 | 8.0 | 8.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 100002 | 9251.775 | 9251.775 | 9251.775 | 179055.0 | 179055.0 | 179055.0 | 179055.0 | 179055.0 | 179055.0 | 1.000000 | 1.000000 | 1.000000 | NaN | 0.0 | 0.0 | 0.0 | 179055.0 | 179055.0 | 179055.0 | 9 | 9 | 9.0 | 0.000000 | 0.000000 | 0.000000 | -606 | -606 | -606.0 | 24.0 | 24.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 1.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 1.000000 | 1.000000 | NaN | 0.0 | 0.0 | 0.0 | 179055.0 | 179055.0 | 179055.0 | 9.0 | 9.0 | 9.0 | 0.000000 | 0.000000 | 0.000000 | -606.0 | -606.0 | -606.0 | 24.0 | 24.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 rows × 250 columns
PREV_AMT_ANNUITY_MIN | PREV_AMT_ANNUITY_MAX | PREV_AMT_ANNUITY_MEAN | PREV_AMT_APPLICATION_MIN | PREV_AMT_APPLICATION_MAX | PREV_AMT_APPLICATION_MEAN | PREV_AMT_CREDIT_MIN | PREV_AMT_CREDIT_MAX | PREV_AMT_CREDIT_MEAN | PREV_APP_CREDIT_PERC_MIN | PREV_APP_CREDIT_PERC_MAX | PREV_APP_CREDIT_PERC_MEAN | PREV_APP_CREDIT_PERC_VAR | PREV_AMT_DOWN_PAYMENT_MIN | PREV_AMT_DOWN_PAYMENT_MAX | PREV_AMT_DOWN_PAYMENT_MEAN | PREV_AMT_GOODS_PRICE_MIN | PREV_AMT_GOODS_PRICE_MAX | PREV_AMT_GOODS_PRICE_MEAN | PREV_HOUR_APPR_PROCESS_START_MIN | PREV_HOUR_APPR_PROCESS_START_MAX | PREV_HOUR_APPR_PROCESS_START_MEAN | PREV_RATE_DOWN_PAYMENT_MIN | PREV_RATE_DOWN_PAYMENT_MAX | PREV_RATE_DOWN_PAYMENT_MEAN | PREV_DAYS_DECISION_MIN | PREV_DAYS_DECISION_MAX | PREV_DAYS_DECISION_MEAN | PREV_CNT_PAYMENT_MEAN | PREV_CNT_PAYMENT_SUM | PREV_NAME_CONTRACT_TYPE_Cash loans_MEAN | PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN | PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN | PREV_NAME_CONTRACT_TYPE_XNA_MEAN | PREV_NAME_CONTRACT_TYPE_nan_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN | PREV_WEEKDAY_APPR_PROCESS_START_nan_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_N_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_Y_MEAN | PREV_FLAG_LAST_APPL_PER_CONTRACT_nan_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Building a house or an annex_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Business development_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a garage_MEAN | PREV_NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land_MEAN | ... | APPROVED_APP_CREDIT_PERC_MAX | APPROVED_APP_CREDIT_PERC_MEAN | APPROVED_APP_CREDIT_PERC_VAR | APPROVED_AMT_DOWN_PAYMENT_MIN | APPROVED_AMT_DOWN_PAYMENT_MAX | APPROVED_AMT_DOWN_PAYMENT_MEAN | APPROVED_AMT_GOODS_PRICE_MIN | APPROVED_AMT_GOODS_PRICE_MAX | APPROVED_AMT_GOODS_PRICE_MEAN | APPROVED_HOUR_APPR_PROCESS_START_MIN | APPROVED_HOUR_APPR_PROCESS_START_MAX | APPROVED_HOUR_APPR_PROCESS_START_MEAN | APPROVED_RATE_DOWN_PAYMENT_MIN | APPROVED_RATE_DOWN_PAYMENT_MAX | APPROVED_RATE_DOWN_PAYMENT_MEAN | APPROVED_DAYS_DECISION_MIN | APPROVED_DAYS_DECISION_MAX | APPROVED_DAYS_DECISION_MEAN | APPROVED_CNT_PAYMENT_MEAN | APPROVED_CNT_PAYMENT_SUM | REFUSED_AMT_ANNUITY_MIN | REFUSED_AMT_ANNUITY_MAX | REFUSED_AMT_ANNUITY_MEAN | REFUSED_AMT_APPLICATION_MIN | REFUSED_AMT_APPLICATION_MAX | REFUSED_AMT_APPLICATION_MEAN | REFUSED_AMT_CREDIT_MIN | REFUSED_AMT_CREDIT_MAX | REFUSED_AMT_CREDIT_MEAN | REFUSED_APP_CREDIT_PERC_MIN | REFUSED_APP_CREDIT_PERC_MAX | REFUSED_APP_CREDIT_PERC_MEAN | REFUSED_APP_CREDIT_PERC_VAR | REFUSED_AMT_DOWN_PAYMENT_MIN | REFUSED_AMT_DOWN_PAYMENT_MAX | REFUSED_AMT_DOWN_PAYMENT_MEAN | REFUSED_AMT_GOODS_PRICE_MIN | REFUSED_AMT_GOODS_PRICE_MAX | REFUSED_AMT_GOODS_PRICE_MEAN | REFUSED_HOUR_APPR_PROCESS_START_MIN | REFUSED_HOUR_APPR_PROCESS_START_MAX | REFUSED_HOUR_APPR_PROCESS_START_MEAN | REFUSED_RATE_DOWN_PAYMENT_MIN | REFUSED_RATE_DOWN_PAYMENT_MAX | REFUSED_RATE_DOWN_PAYMENT_MEAN | REFUSED_DAYS_DECISION_MIN | REFUSED_DAYS_DECISION_MAX | REFUSED_DAYS_DECISION_MEAN | REFUSED_CNT_PAYMENT_MEAN | REFUSED_CNT_PAYMENT_SUM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100001 | 3951.000 | 3951.000 | 3951.000 | 24835.5 | 24835.5 | 24835.5 | 23787.0 | 23787.0 | 23787.0 | 1.044079 | 1.044079 | 1.044079 | NaN | 2520.0 | 2520.0 | 2520.0 | 24835.5 | 24835.5 | 24835.5 | 13 | 13 | 13.0 | 0.104326 | 0.104326 | 0.104326 | -1740 | -1740 | -1740.0 | 8.0 | 8.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 1.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.044079 | 1.044079 | NaN | 2520.0 | 2520.0 | 2520.0 | 24835.5 | 24835.5 | 24835.5 | 13.0 | 13.0 | 13.0 | 0.104326 | 0.104326 | 0.104326 | -1740.0 | -1740.0 | -1740.0 | 8.0 | 8.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
100002 | 9251.775 | 9251.775 | 9251.775 | 179055.0 | 179055.0 | 179055.0 | 179055.0 | 179055.0 | 179055.0 | 1.000000 | 1.000000 | 1.000000 | NaN | 0.0 | 0.0 | 0.0 | 179055.0 | 179055.0 | 179055.0 | 9 | 9 | 9.0 | 0.000000 | 0.000000 | 0.000000 | -606 | -606 | -606.0 | 24.0 | 24.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0.0 | 1.0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.000000 | 1.000000 | NaN | 0.0 | 0.0 | 0.0 | 179055.0 | 179055.0 | 179055.0 | 9.0 | 9.0 | 9.0 | 0.000000 | 0.000000 | 0.000000 | -606.0 | -606.0 | -606.0 | 24.0 | 24.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 rows × 249 columns
(338857, 249)
# 6.0 Join prev_agg with df
df = df.join(prev_agg, how='left', on='SK_ID_CURR')
df.shape # (356251, 624)
df.head(2)
(356251, 624)
TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | ... | APPROVED_APP_CREDIT_PERC_MAX | APPROVED_APP_CREDIT_PERC_MEAN | APPROVED_APP_CREDIT_PERC_VAR | APPROVED_AMT_DOWN_PAYMENT_MIN | APPROVED_AMT_DOWN_PAYMENT_MAX | APPROVED_AMT_DOWN_PAYMENT_MEAN | APPROVED_AMT_GOODS_PRICE_MIN | APPROVED_AMT_GOODS_PRICE_MAX | APPROVED_AMT_GOODS_PRICE_MEAN | APPROVED_HOUR_APPR_PROCESS_START_MIN | APPROVED_HOUR_APPR_PROCESS_START_MAX | APPROVED_HOUR_APPR_PROCESS_START_MEAN | APPROVED_RATE_DOWN_PAYMENT_MIN | APPROVED_RATE_DOWN_PAYMENT_MAX | APPROVED_RATE_DOWN_PAYMENT_MEAN | APPROVED_DAYS_DECISION_MIN | APPROVED_DAYS_DECISION_MAX | APPROVED_DAYS_DECISION_MEAN | APPROVED_CNT_PAYMENT_MEAN | APPROVED_CNT_PAYMENT_SUM | REFUSED_AMT_ANNUITY_MIN | REFUSED_AMT_ANNUITY_MAX | REFUSED_AMT_ANNUITY_MEAN | REFUSED_AMT_APPLICATION_MIN | REFUSED_AMT_APPLICATION_MAX | REFUSED_AMT_APPLICATION_MEAN | REFUSED_AMT_CREDIT_MIN | REFUSED_AMT_CREDIT_MAX | REFUSED_AMT_CREDIT_MEAN | REFUSED_APP_CREDIT_PERC_MIN | REFUSED_APP_CREDIT_PERC_MAX | REFUSED_APP_CREDIT_PERC_MEAN | REFUSED_APP_CREDIT_PERC_VAR | REFUSED_AMT_DOWN_PAYMENT_MIN | REFUSED_AMT_DOWN_PAYMENT_MAX | REFUSED_AMT_DOWN_PAYMENT_MEAN | REFUSED_AMT_GOODS_PRICE_MIN | REFUSED_AMT_GOODS_PRICE_MAX | REFUSED_AMT_GOODS_PRICE_MEAN | REFUSED_HOUR_APPR_PROCESS_START_MIN | REFUSED_HOUR_APPR_PROCESS_START_MAX | REFUSED_HOUR_APPR_PROCESS_START_MEAN | REFUSED_RATE_DOWN_PAYMENT_MIN | REFUSED_RATE_DOWN_PAYMENT_MAX | REFUSED_RATE_DOWN_PAYMENT_MEAN | REFUSED_DAYS_DECISION_MIN | REFUSED_DAYS_DECISION_MAX | REFUSED_DAYS_DECISION_MEAN | REFUSED_CNT_PAYMENT_MEAN | REFUSED_CNT_PAYMENT_SUM | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | ... | 1.000000 | 1.000000 | NaN | 0.0 | 0.0 | 0.0 | 179055.0 | 179055.0 | 179055.0 | 9.0 | 9.0 | 9.000000 | 0.0 | 0.000000 | 0.00000 | -606.0 | -606.0 | -606.0 | 24.0 | 24.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | ... | 1.011109 | 0.949329 | 0.005324 | 0.0 | 6885.0 | 3442.5 | 68809.5 | 900000.0 | 435436.5 | 12.0 | 17.0 | 14.666667 | 0.0 | 0.100061 | 0.05003 | -2341.0 | -746.0 | -1305.0 | 10.0 | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 rows × 624 columns
# 7.0 Read processed POS data
pos_agg = pd.read_csv(
'processed_pos_agg.csv.zip',
nrows = num_rows
)
# 7.0.1 Reduce memory usage by appropriately
# changing data-types per feature:
pos_agg = reducing.Reducer().reduce(pos_agg)
reduced df from 48.8877 MB to 19.9411 MB in 0.22 seconds
# 7.1
pos_agg.shape # (337252, 19)
pos_agg.head(2)
pos_agg = pos_agg.set_index("SK_ID_CURR")
pos_agg.head(2)
pos_agg.shape # (337252, 18)
(337252, 19)
SK_ID_CURR | POS_MONTHS_BALANCE_MAX | POS_MONTHS_BALANCE_MEAN | POS_MONTHS_BALANCE_SIZE | POS_SK_DPD_MAX | POS_SK_DPD_MEAN | POS_SK_DPD_DEF_MAX | POS_SK_DPD_DEF_MEAN | POS_NAME_CONTRACT_STATUS_Active_MEAN | POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | POS_NAME_CONTRACT_STATUS_Approved_MEAN | POS_NAME_CONTRACT_STATUS_Canceled_MEAN | POS_NAME_CONTRACT_STATUS_Completed_MEAN | POS_NAME_CONTRACT_STATUS_Demand_MEAN | POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | POS_NAME_CONTRACT_STATUS_Signed_MEAN | POS_NAME_CONTRACT_STATUS_XNA_MEAN | POS_NAME_CONTRACT_STATUS_nan_MEAN | POS_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | -53 | -72.555557 | 9 | 7 | 0.777778 | 7 | 0.777778 | 0.777778 | 0.0 | 0.0 | 0.0 | 0.222222 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 9 |
1 | 100002 | -1 | -10.000000 | 19 | 0 | 0.000000 | 0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 19 |
POS_MONTHS_BALANCE_MAX | POS_MONTHS_BALANCE_MEAN | POS_MONTHS_BALANCE_SIZE | POS_SK_DPD_MAX | POS_SK_DPD_MEAN | POS_SK_DPD_DEF_MAX | POS_SK_DPD_DEF_MEAN | POS_NAME_CONTRACT_STATUS_Active_MEAN | POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | POS_NAME_CONTRACT_STATUS_Approved_MEAN | POS_NAME_CONTRACT_STATUS_Canceled_MEAN | POS_NAME_CONTRACT_STATUS_Completed_MEAN | POS_NAME_CONTRACT_STATUS_Demand_MEAN | POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | POS_NAME_CONTRACT_STATUS_Signed_MEAN | POS_NAME_CONTRACT_STATUS_XNA_MEAN | POS_NAME_CONTRACT_STATUS_nan_MEAN | POS_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | ||||||||||||||||||
100001 | -53 | -72.555557 | 9 | 7 | 0.777778 | 7 | 0.777778 | 0.777778 | 0.0 | 0.0 | 0.0 | 0.222222 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 9 |
100002 | -1 | -10.000000 | 19 | 0 | 0.000000 | 0 | 0.000000 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 19 |
(337252, 18)
# 7.2 Join POS with df
df = df.join(
pos_agg,
how='left',
on='SK_ID_CURR'
)
df.shape # (356251, 642)
df.head(2)
(356251, 642)
TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | ... | APPROVED_CNT_PAYMENT_MEAN | APPROVED_CNT_PAYMENT_SUM | REFUSED_AMT_ANNUITY_MIN | REFUSED_AMT_ANNUITY_MAX | REFUSED_AMT_ANNUITY_MEAN | REFUSED_AMT_APPLICATION_MIN | REFUSED_AMT_APPLICATION_MAX | REFUSED_AMT_APPLICATION_MEAN | REFUSED_AMT_CREDIT_MIN | REFUSED_AMT_CREDIT_MAX | REFUSED_AMT_CREDIT_MEAN | REFUSED_APP_CREDIT_PERC_MIN | REFUSED_APP_CREDIT_PERC_MAX | REFUSED_APP_CREDIT_PERC_MEAN | REFUSED_APP_CREDIT_PERC_VAR | REFUSED_AMT_DOWN_PAYMENT_MIN | REFUSED_AMT_DOWN_PAYMENT_MAX | REFUSED_AMT_DOWN_PAYMENT_MEAN | REFUSED_AMT_GOODS_PRICE_MIN | REFUSED_AMT_GOODS_PRICE_MAX | REFUSED_AMT_GOODS_PRICE_MEAN | REFUSED_HOUR_APPR_PROCESS_START_MIN | REFUSED_HOUR_APPR_PROCESS_START_MAX | REFUSED_HOUR_APPR_PROCESS_START_MEAN | REFUSED_RATE_DOWN_PAYMENT_MIN | REFUSED_RATE_DOWN_PAYMENT_MAX | REFUSED_RATE_DOWN_PAYMENT_MEAN | REFUSED_DAYS_DECISION_MIN | REFUSED_DAYS_DECISION_MAX | REFUSED_DAYS_DECISION_MEAN | REFUSED_CNT_PAYMENT_MEAN | REFUSED_CNT_PAYMENT_SUM | POS_MONTHS_BALANCE_MAX | POS_MONTHS_BALANCE_MEAN | POS_MONTHS_BALANCE_SIZE | POS_SK_DPD_MAX | POS_SK_DPD_MEAN | POS_SK_DPD_DEF_MAX | POS_SK_DPD_DEF_MEAN | POS_NAME_CONTRACT_STATUS_Active_MEAN | POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | POS_NAME_CONTRACT_STATUS_Approved_MEAN | POS_NAME_CONTRACT_STATUS_Canceled_MEAN | POS_NAME_CONTRACT_STATUS_Completed_MEAN | POS_NAME_CONTRACT_STATUS_Demand_MEAN | POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | POS_NAME_CONTRACT_STATUS_Signed_MEAN | POS_NAME_CONTRACT_STATUS_XNA_MEAN | POS_NAME_CONTRACT_STATUS_nan_MEAN | POS_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | ... | 24.0 | 24.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1.0 | -10.000000 | 19.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 19.0 |
100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | ... | 10.0 | 30.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -18.0 | -43.785713 | 28.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.928571 | 0.0 | 0.0 | 0.0 | 0.071429 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 28.0 |
2 rows × 642 columns
# 8.0 Read processed installments data
ins_agg = pd.read_csv(
'processed_ins_agg.csv.zip',
nrows = num_rows
)
# 8.0.1 Reduce memory usage by appropriately
# changing data-types per feature:
ins_agg = reducing.Reducer().reduce(ins_agg)
reduced df from 69.9529 MB to 46.3114 MB in 0.32 seconds
# 8.1 Set index
ins_agg.shape # (339587, 26)
ins_agg.head(2)
ins_agg = ins_agg.set_index("SK_ID_CURR")
ins_agg.head(2)
ins_agg.shape # (339587, 25)
(339587, 26)
SK_ID_CURR | INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE | INSTAL_DPD_MAX | INSTAL_DPD_MEAN | INSTAL_DPD_SUM | INSTAL_DBD_MAX | INSTAL_DBD_MEAN | INSTAL_DBD_SUM | INSTAL_PAYMENT_PERC_MAX | INSTAL_PAYMENT_PERC_MEAN | INSTAL_PAYMENT_PERC_VAR | INSTAL_PAYMENT_DIFF_MAX | INSTAL_PAYMENT_DIFF_MEAN | INSTAL_PAYMENT_DIFF_SUM | INSTAL_PAYMENT_DIFF_VAR | INSTAL_AMT_INSTALMENT_MAX | INSTAL_AMT_INSTALMENT_MEAN | INSTAL_AMT_INSTALMENT_SUM | INSTAL_AMT_PAYMENT_MIN | INSTAL_AMT_PAYMENT_MAX | INSTAL_AMT_PAYMENT_MEAN | INSTAL_AMT_PAYMENT_SUM | INSTAL_DAYS_ENTRY_PAYMENT_MAX | INSTAL_DAYS_ENTRY_PAYMENT_MEAN | INSTAL_DAYS_ENTRY_PAYMENT_SUM | INSTAL_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | 2 | 11.0 | 1.571429 | 11.0 | 36.0 | 8.857142 | 62.0 | 1.0 | 1.0 | 7.564931e-17 | 0.000391 | 0.000077 | 0.000537 | 1.975468e-08 | 17397.900391 | 5885.132324 | 41195.925781 | 3951.000 | 17397.900 | 5885.132143 | 41195.925781 | -1628.0 | -2195.000000 | -15365.0 | 7 |
1 | 100002 | 2 | 0.0 | 0.000000 | 0.0 | 31.0 | 20.421053 | 388.0 | 1.0 | 1.0 | 2.460418e-17 | 0.001094 | 0.000428 | 0.008125 | 2.602025e-08 | 53093.746094 | 11559.247070 | 219625.703125 | 9251.775 | 53093.745 | 11559.247105 | 219625.687500 | -49.0 | -315.421053 | -5993.0 | 19 |
INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE | INSTAL_DPD_MAX | INSTAL_DPD_MEAN | INSTAL_DPD_SUM | INSTAL_DBD_MAX | INSTAL_DBD_MEAN | INSTAL_DBD_SUM | INSTAL_PAYMENT_PERC_MAX | INSTAL_PAYMENT_PERC_MEAN | INSTAL_PAYMENT_PERC_VAR | INSTAL_PAYMENT_DIFF_MAX | INSTAL_PAYMENT_DIFF_MEAN | INSTAL_PAYMENT_DIFF_SUM | INSTAL_PAYMENT_DIFF_VAR | INSTAL_AMT_INSTALMENT_MAX | INSTAL_AMT_INSTALMENT_MEAN | INSTAL_AMT_INSTALMENT_SUM | INSTAL_AMT_PAYMENT_MIN | INSTAL_AMT_PAYMENT_MAX | INSTAL_AMT_PAYMENT_MEAN | INSTAL_AMT_PAYMENT_SUM | INSTAL_DAYS_ENTRY_PAYMENT_MAX | INSTAL_DAYS_ENTRY_PAYMENT_MEAN | INSTAL_DAYS_ENTRY_PAYMENT_SUM | INSTAL_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||
100001 | 2 | 11.0 | 1.571429 | 11.0 | 36.0 | 8.857142 | 62.0 | 1.0 | 1.0 | 7.564931e-17 | 0.000391 | 0.000077 | 0.000537 | 1.975468e-08 | 17397.900391 | 5885.132324 | 41195.925781 | 3951.000 | 17397.900 | 5885.132143 | 41195.925781 | -1628.0 | -2195.000000 | -15365.0 | 7 |
100002 | 2 | 0.0 | 0.000000 | 0.0 | 31.0 | 20.421053 | 388.0 | 1.0 | 1.0 | 2.460418e-17 | 0.001094 | 0.000428 | 0.008125 | 2.602025e-08 | 53093.746094 | 11559.247070 | 219625.703125 | 9251.775 | 53093.745 | 11559.247105 | 219625.687500 | -49.0 | -315.421053 | -5993.0 | 19 |
(339587, 25)
# 9.0 Join Installments data with df
df = df.join(ins_agg, how='left', on='SK_ID_CURR')
df.shape # (356251, 667)
df.head(2)
(356251, 667)
TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | ... | REFUSED_RATE_DOWN_PAYMENT_MAX | REFUSED_RATE_DOWN_PAYMENT_MEAN | REFUSED_DAYS_DECISION_MIN | REFUSED_DAYS_DECISION_MAX | REFUSED_DAYS_DECISION_MEAN | REFUSED_CNT_PAYMENT_MEAN | REFUSED_CNT_PAYMENT_SUM | POS_MONTHS_BALANCE_MAX | POS_MONTHS_BALANCE_MEAN | POS_MONTHS_BALANCE_SIZE | POS_SK_DPD_MAX | POS_SK_DPD_MEAN | POS_SK_DPD_DEF_MAX | POS_SK_DPD_DEF_MEAN | POS_NAME_CONTRACT_STATUS_Active_MEAN | POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN | POS_NAME_CONTRACT_STATUS_Approved_MEAN | POS_NAME_CONTRACT_STATUS_Canceled_MEAN | POS_NAME_CONTRACT_STATUS_Completed_MEAN | POS_NAME_CONTRACT_STATUS_Demand_MEAN | POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN | POS_NAME_CONTRACT_STATUS_Signed_MEAN | POS_NAME_CONTRACT_STATUS_XNA_MEAN | POS_NAME_CONTRACT_STATUS_nan_MEAN | POS_COUNT | INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE | INSTAL_DPD_MAX | INSTAL_DPD_MEAN | INSTAL_DPD_SUM | INSTAL_DBD_MAX | INSTAL_DBD_MEAN | INSTAL_DBD_SUM | INSTAL_PAYMENT_PERC_MAX | INSTAL_PAYMENT_PERC_MEAN | INSTAL_PAYMENT_PERC_VAR | INSTAL_PAYMENT_DIFF_MAX | INSTAL_PAYMENT_DIFF_MEAN | INSTAL_PAYMENT_DIFF_SUM | INSTAL_PAYMENT_DIFF_VAR | INSTAL_AMT_INSTALMENT_MAX | INSTAL_AMT_INSTALMENT_MEAN | INSTAL_AMT_INSTALMENT_SUM | INSTAL_AMT_PAYMENT_MIN | INSTAL_AMT_PAYMENT_MAX | INSTAL_AMT_PAYMENT_MEAN | INSTAL_AMT_PAYMENT_SUM | INSTAL_DAYS_ENTRY_PAYMENT_MAX | INSTAL_DAYS_ENTRY_PAYMENT_MEAN | INSTAL_DAYS_ENTRY_PAYMENT_SUM | INSTAL_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -1.0 | -10.000000 | 19.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 19.0 | 2.0 | 0.0 | 0.0 | 0.0 | 31.0 | 20.421053 | 388.0 | 1.0 | 1.0 | 2.460418e-17 | 0.001094 | 0.000428 | 0.008125 | 2.602025e-08 | 53093.746094 | 11559.247070 | 2.196257e+05 | 9251.775 | 53093.745 | 11559.247105 | 2.196257e+05 | -49.0 | -315.421053 | -5993.0 | 19.0 |
100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -18.0 | -43.785713 | 28.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.928571 | 0.0 | 0.0 | 0.0 | 0.071429 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 28.0 | 2.0 | 0.0 | 0.0 | 0.0 | 14.0 | 7.160000 | 179.0 | 1.0 | 1.0 | 3.633997e-16 | 0.015000 | -0.000297 | -0.007422 | 1.157042e-05 | 560835.375000 | 64754.585938 | 1.618865e+06 | 6662.970 | 560835.360 | 64754.586000 | 1.618865e+06 | -544.0 | -1385.320000 | -34633.0 | 25.0 |
2 rows × 667 columns
# 10.0 Read Credit card data
cc_agg = pd.read_csv(
'processed_creditCard_agg.csv.zip',
nrows = num_rows
)
# 10.0.1 Reduce memory usage by appropriately
# changing data-types per feature:
cc_agg = reducing.Reducer().reduce(cc_agg)
reduced df from 112.1922 MB to 65.3796 MB in 0.40 seconds
# 10.1 Set Index
cc_agg.shape # (103558, 142)
cc_agg.head(2)
cc_agg = cc_agg.set_index("SK_ID_CURR")
cc_agg.head(2)
cc_agg.shape # (103558, 141)
(103558, 142)
SK_ID_CURR | CC_MONTHS_BALANCE_MIN | CC_MONTHS_BALANCE_MAX | CC_MONTHS_BALANCE_MEAN | CC_MONTHS_BALANCE_SUM | CC_MONTHS_BALANCE_VAR | CC_AMT_BALANCE_MIN | CC_AMT_BALANCE_MAX | CC_AMT_BALANCE_MEAN | CC_AMT_BALANCE_SUM | CC_AMT_BALANCE_VAR | CC_AMT_CREDIT_LIMIT_ACTUAL_MIN | CC_AMT_CREDIT_LIMIT_ACTUAL_MAX | CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN | CC_AMT_CREDIT_LIMIT_ACTUAL_SUM | CC_AMT_CREDIT_LIMIT_ACTUAL_VAR | CC_AMT_DRAWINGS_ATM_CURRENT_MIN | CC_AMT_DRAWINGS_ATM_CURRENT_MAX | CC_AMT_DRAWINGS_ATM_CURRENT_MEAN | CC_AMT_DRAWINGS_ATM_CURRENT_SUM | CC_AMT_DRAWINGS_ATM_CURRENT_VAR | CC_AMT_DRAWINGS_CURRENT_MIN | CC_AMT_DRAWINGS_CURRENT_MAX | CC_AMT_DRAWINGS_CURRENT_MEAN | CC_AMT_DRAWINGS_CURRENT_SUM | CC_AMT_DRAWINGS_CURRENT_VAR | CC_AMT_DRAWINGS_OTHER_CURRENT_MIN | CC_AMT_DRAWINGS_OTHER_CURRENT_MAX | CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN | CC_AMT_DRAWINGS_OTHER_CURRENT_SUM | CC_AMT_DRAWINGS_OTHER_CURRENT_VAR | CC_AMT_DRAWINGS_POS_CURRENT_MIN | CC_AMT_DRAWINGS_POS_CURRENT_MAX | CC_AMT_DRAWINGS_POS_CURRENT_MEAN | CC_AMT_DRAWINGS_POS_CURRENT_SUM | CC_AMT_DRAWINGS_POS_CURRENT_VAR | CC_AMT_INST_MIN_REGULARITY_MIN | CC_AMT_INST_MIN_REGULARITY_MAX | CC_AMT_INST_MIN_REGULARITY_MEAN | CC_AMT_INST_MIN_REGULARITY_SUM | CC_AMT_INST_MIN_REGULARITY_VAR | CC_AMT_PAYMENT_CURRENT_MIN | CC_AMT_PAYMENT_CURRENT_MAX | CC_AMT_PAYMENT_CURRENT_MEAN | CC_AMT_PAYMENT_CURRENT_SUM | CC_AMT_PAYMENT_CURRENT_VAR | CC_AMT_PAYMENT_TOTAL_CURRENT_MIN | CC_AMT_PAYMENT_TOTAL_CURRENT_MAX | CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN | CC_AMT_PAYMENT_TOTAL_CURRENT_SUM | ... | CC_SK_DPD_MAX | CC_SK_DPD_MEAN | CC_SK_DPD_SUM | CC_SK_DPD_VAR | CC_SK_DPD_DEF_MIN | CC_SK_DPD_DEF_MAX | CC_SK_DPD_DEF_MEAN | CC_SK_DPD_DEF_SUM | CC_SK_DPD_DEF_VAR | CC_NAME_CONTRACT_STATUS_Active_MIN | CC_NAME_CONTRACT_STATUS_Active_MAX | CC_NAME_CONTRACT_STATUS_Active_MEAN | CC_NAME_CONTRACT_STATUS_Active_SUM | CC_NAME_CONTRACT_STATUS_Active_VAR | CC_NAME_CONTRACT_STATUS_Approved_MIN | CC_NAME_CONTRACT_STATUS_Approved_MAX | CC_NAME_CONTRACT_STATUS_Approved_MEAN | CC_NAME_CONTRACT_STATUS_Approved_SUM | CC_NAME_CONTRACT_STATUS_Approved_VAR | CC_NAME_CONTRACT_STATUS_Completed_MIN | CC_NAME_CONTRACT_STATUS_Completed_MAX | CC_NAME_CONTRACT_STATUS_Completed_MEAN | CC_NAME_CONTRACT_STATUS_Completed_SUM | CC_NAME_CONTRACT_STATUS_Completed_VAR | CC_NAME_CONTRACT_STATUS_Demand_MIN | CC_NAME_CONTRACT_STATUS_Demand_MAX | CC_NAME_CONTRACT_STATUS_Demand_MEAN | CC_NAME_CONTRACT_STATUS_Demand_SUM | CC_NAME_CONTRACT_STATUS_Demand_VAR | CC_NAME_CONTRACT_STATUS_Refused_MIN | CC_NAME_CONTRACT_STATUS_Refused_MAX | CC_NAME_CONTRACT_STATUS_Refused_MEAN | CC_NAME_CONTRACT_STATUS_Refused_SUM | CC_NAME_CONTRACT_STATUS_Refused_VAR | CC_NAME_CONTRACT_STATUS_Sent proposal_MIN | CC_NAME_CONTRACT_STATUS_Sent proposal_MAX | CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN | CC_NAME_CONTRACT_STATUS_Sent proposal_SUM | CC_NAME_CONTRACT_STATUS_Sent proposal_VAR | CC_NAME_CONTRACT_STATUS_Signed_MIN | CC_NAME_CONTRACT_STATUS_Signed_MAX | CC_NAME_CONTRACT_STATUS_Signed_MEAN | CC_NAME_CONTRACT_STATUS_Signed_SUM | CC_NAME_CONTRACT_STATUS_Signed_VAR | CC_NAME_CONTRACT_STATUS_nan_MIN | CC_NAME_CONTRACT_STATUS_nan_MAX | CC_NAME_CONTRACT_STATUS_nan_MEAN | CC_NAME_CONTRACT_STATUS_nan_SUM | CC_NAME_CONTRACT_STATUS_nan_VAR | CC_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100006 | -6 | -1 | -3.5 | -21 | 3.5 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.000000e+00 | 270000 | 270000 | 270000.0000 | 1620000 | 0.000000e+00 | NaN | NaN | NaN | 0.0 | NaN | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000e+00 | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.0 | 0.0 | 0.000000 | 0.0000 | 0.000000e+00 | NaN | NaN | NaN | 0.00 | NaN | 0.0 | 0.0 | 0.000000 | 0.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 1 | 1 | 1.0 | 6 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 6 |
1 | 100011 | -75 | -2 | -38.5 | -2849 | 462.5 | 0.0 | 189000.0 | 54482.109375 | 4031676.25 | 4.641321e+09 | 90000 | 180000 | 164189.1875 | 12150000 | 1.189060e+09 | 0.0 | 180000.0 | 2432.432432 | 180000.0 | 4.378378e+08 | 0.0 | 180000.0 | 2432.432373 | 180000.0 | 4.378378e+08 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9000.0 | 3956.221924 | 288804.1875 | 2.013991e+07 | 0.0 | 55485.0 | 4843.064189 | 358386.75 | 5.299260e+07 | 0.0 | 55485.0 | 4520.067383 | 334485.0 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 1 | 1 | 1.0 | 74 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 74 |
2 rows × 142 columns
CC_MONTHS_BALANCE_MIN | CC_MONTHS_BALANCE_MAX | CC_MONTHS_BALANCE_MEAN | CC_MONTHS_BALANCE_SUM | CC_MONTHS_BALANCE_VAR | CC_AMT_BALANCE_MIN | CC_AMT_BALANCE_MAX | CC_AMT_BALANCE_MEAN | CC_AMT_BALANCE_SUM | CC_AMT_BALANCE_VAR | CC_AMT_CREDIT_LIMIT_ACTUAL_MIN | CC_AMT_CREDIT_LIMIT_ACTUAL_MAX | CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN | CC_AMT_CREDIT_LIMIT_ACTUAL_SUM | CC_AMT_CREDIT_LIMIT_ACTUAL_VAR | CC_AMT_DRAWINGS_ATM_CURRENT_MIN | CC_AMT_DRAWINGS_ATM_CURRENT_MAX | CC_AMT_DRAWINGS_ATM_CURRENT_MEAN | CC_AMT_DRAWINGS_ATM_CURRENT_SUM | CC_AMT_DRAWINGS_ATM_CURRENT_VAR | CC_AMT_DRAWINGS_CURRENT_MIN | CC_AMT_DRAWINGS_CURRENT_MAX | CC_AMT_DRAWINGS_CURRENT_MEAN | CC_AMT_DRAWINGS_CURRENT_SUM | CC_AMT_DRAWINGS_CURRENT_VAR | CC_AMT_DRAWINGS_OTHER_CURRENT_MIN | CC_AMT_DRAWINGS_OTHER_CURRENT_MAX | CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN | CC_AMT_DRAWINGS_OTHER_CURRENT_SUM | CC_AMT_DRAWINGS_OTHER_CURRENT_VAR | CC_AMT_DRAWINGS_POS_CURRENT_MIN | CC_AMT_DRAWINGS_POS_CURRENT_MAX | CC_AMT_DRAWINGS_POS_CURRENT_MEAN | CC_AMT_DRAWINGS_POS_CURRENT_SUM | CC_AMT_DRAWINGS_POS_CURRENT_VAR | CC_AMT_INST_MIN_REGULARITY_MIN | CC_AMT_INST_MIN_REGULARITY_MAX | CC_AMT_INST_MIN_REGULARITY_MEAN | CC_AMT_INST_MIN_REGULARITY_SUM | CC_AMT_INST_MIN_REGULARITY_VAR | CC_AMT_PAYMENT_CURRENT_MIN | CC_AMT_PAYMENT_CURRENT_MAX | CC_AMT_PAYMENT_CURRENT_MEAN | CC_AMT_PAYMENT_CURRENT_SUM | CC_AMT_PAYMENT_CURRENT_VAR | CC_AMT_PAYMENT_TOTAL_CURRENT_MIN | CC_AMT_PAYMENT_TOTAL_CURRENT_MAX | CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN | CC_AMT_PAYMENT_TOTAL_CURRENT_SUM | CC_AMT_PAYMENT_TOTAL_CURRENT_VAR | ... | CC_SK_DPD_MAX | CC_SK_DPD_MEAN | CC_SK_DPD_SUM | CC_SK_DPD_VAR | CC_SK_DPD_DEF_MIN | CC_SK_DPD_DEF_MAX | CC_SK_DPD_DEF_MEAN | CC_SK_DPD_DEF_SUM | CC_SK_DPD_DEF_VAR | CC_NAME_CONTRACT_STATUS_Active_MIN | CC_NAME_CONTRACT_STATUS_Active_MAX | CC_NAME_CONTRACT_STATUS_Active_MEAN | CC_NAME_CONTRACT_STATUS_Active_SUM | CC_NAME_CONTRACT_STATUS_Active_VAR | CC_NAME_CONTRACT_STATUS_Approved_MIN | CC_NAME_CONTRACT_STATUS_Approved_MAX | CC_NAME_CONTRACT_STATUS_Approved_MEAN | CC_NAME_CONTRACT_STATUS_Approved_SUM | CC_NAME_CONTRACT_STATUS_Approved_VAR | CC_NAME_CONTRACT_STATUS_Completed_MIN | CC_NAME_CONTRACT_STATUS_Completed_MAX | CC_NAME_CONTRACT_STATUS_Completed_MEAN | CC_NAME_CONTRACT_STATUS_Completed_SUM | CC_NAME_CONTRACT_STATUS_Completed_VAR | CC_NAME_CONTRACT_STATUS_Demand_MIN | CC_NAME_CONTRACT_STATUS_Demand_MAX | CC_NAME_CONTRACT_STATUS_Demand_MEAN | CC_NAME_CONTRACT_STATUS_Demand_SUM | CC_NAME_CONTRACT_STATUS_Demand_VAR | CC_NAME_CONTRACT_STATUS_Refused_MIN | CC_NAME_CONTRACT_STATUS_Refused_MAX | CC_NAME_CONTRACT_STATUS_Refused_MEAN | CC_NAME_CONTRACT_STATUS_Refused_SUM | CC_NAME_CONTRACT_STATUS_Refused_VAR | CC_NAME_CONTRACT_STATUS_Sent proposal_MIN | CC_NAME_CONTRACT_STATUS_Sent proposal_MAX | CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN | CC_NAME_CONTRACT_STATUS_Sent proposal_SUM | CC_NAME_CONTRACT_STATUS_Sent proposal_VAR | CC_NAME_CONTRACT_STATUS_Signed_MIN | CC_NAME_CONTRACT_STATUS_Signed_MAX | CC_NAME_CONTRACT_STATUS_Signed_MEAN | CC_NAME_CONTRACT_STATUS_Signed_SUM | CC_NAME_CONTRACT_STATUS_Signed_VAR | CC_NAME_CONTRACT_STATUS_nan_MIN | CC_NAME_CONTRACT_STATUS_nan_MAX | CC_NAME_CONTRACT_STATUS_nan_MEAN | CC_NAME_CONTRACT_STATUS_nan_SUM | CC_NAME_CONTRACT_STATUS_nan_VAR | CC_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100006 | -6 | -1 | -3.5 | -21 | 3.5 | 0.0 | 0.0 | 0.000000 | 0.00 | 0.000000e+00 | 270000 | 270000 | 270000.0000 | 1620000 | 0.000000e+00 | NaN | NaN | NaN | 0.0 | NaN | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000e+00 | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | 0.0 | NaN | 0.0 | 0.0 | 0.000000 | 0.0000 | 0.000000e+00 | NaN | NaN | NaN | 0.00 | NaN | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000e+00 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 1 | 1 | 1.0 | 6 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 6 |
100011 | -75 | -2 | -38.5 | -2849 | 462.5 | 0.0 | 189000.0 | 54482.109375 | 4031676.25 | 4.641321e+09 | 90000 | 180000 | 164189.1875 | 12150000 | 1.189060e+09 | 0.0 | 180000.0 | 2432.432432 | 180000.0 | 4.378378e+08 | 0.0 | 180000.0 | 2432.432373 | 180000.0 | 4.378378e+08 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9000.0 | 3956.221924 | 288804.1875 | 2.013991e+07 | 0.0 | 55485.0 | 4843.064189 | 358386.75 | 5.299260e+07 | 0.0 | 55485.0 | 4520.067383 | 334485.0 | 5.585877e+07 | ... | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 1 | 1 | 1.0 | 74 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | 0 | 0 | 0.0 | 74 |
2 rows × 141 columns
(103558, 141)
# 11. Join Credit card data with df
df = df.join(cc_agg, how='left', on='SK_ID_CURR')
df.shape # (356251, 808)
df.head(2)
(356251, 808)
TARGET | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | ... | CC_SK_DPD_MAX | CC_SK_DPD_MEAN | CC_SK_DPD_SUM | CC_SK_DPD_VAR | CC_SK_DPD_DEF_MIN | CC_SK_DPD_DEF_MAX | CC_SK_DPD_DEF_MEAN | CC_SK_DPD_DEF_SUM | CC_SK_DPD_DEF_VAR | CC_NAME_CONTRACT_STATUS_Active_MIN | CC_NAME_CONTRACT_STATUS_Active_MAX | CC_NAME_CONTRACT_STATUS_Active_MEAN | CC_NAME_CONTRACT_STATUS_Active_SUM | CC_NAME_CONTRACT_STATUS_Active_VAR | CC_NAME_CONTRACT_STATUS_Approved_MIN | CC_NAME_CONTRACT_STATUS_Approved_MAX | CC_NAME_CONTRACT_STATUS_Approved_MEAN | CC_NAME_CONTRACT_STATUS_Approved_SUM | CC_NAME_CONTRACT_STATUS_Approved_VAR | CC_NAME_CONTRACT_STATUS_Completed_MIN | CC_NAME_CONTRACT_STATUS_Completed_MAX | CC_NAME_CONTRACT_STATUS_Completed_MEAN | CC_NAME_CONTRACT_STATUS_Completed_SUM | CC_NAME_CONTRACT_STATUS_Completed_VAR | CC_NAME_CONTRACT_STATUS_Demand_MIN | CC_NAME_CONTRACT_STATUS_Demand_MAX | CC_NAME_CONTRACT_STATUS_Demand_MEAN | CC_NAME_CONTRACT_STATUS_Demand_SUM | CC_NAME_CONTRACT_STATUS_Demand_VAR | CC_NAME_CONTRACT_STATUS_Refused_MIN | CC_NAME_CONTRACT_STATUS_Refused_MAX | CC_NAME_CONTRACT_STATUS_Refused_MEAN | CC_NAME_CONTRACT_STATUS_Refused_SUM | CC_NAME_CONTRACT_STATUS_Refused_VAR | CC_NAME_CONTRACT_STATUS_Sent proposal_MIN | CC_NAME_CONTRACT_STATUS_Sent proposal_MAX | CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN | CC_NAME_CONTRACT_STATUS_Sent proposal_SUM | CC_NAME_CONTRACT_STATUS_Sent proposal_VAR | CC_NAME_CONTRACT_STATUS_Signed_MIN | CC_NAME_CONTRACT_STATUS_Signed_MAX | CC_NAME_CONTRACT_STATUS_Signed_MEAN | CC_NAME_CONTRACT_STATUS_Signed_SUM | CC_NAME_CONTRACT_STATUS_Signed_VAR | CC_NAME_CONTRACT_STATUS_nan_MIN | CC_NAME_CONTRACT_STATUS_nan_MAX | CC_NAME_CONTRACT_STATUS_nan_MEAN | CC_NAME_CONTRACT_STATUS_nan_SUM | CC_NAME_CONTRACT_STATUS_nan_VAR | CC_COUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SK_ID_CURR | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
100002 | 1.0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.018801 | -9461 | -637.0 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 1.0 | 2 | 2 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
100003 | 0.0 | 1 | 0 | 1 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.003541 | -16765 | -1188.0 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | 2.0 | 1 | 1 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 rows × 808 columns
# 11.1 Save the results for subsequent use:
df.to_csv("processed_df_joined.csv.zip", compression = "zip")
##################