In [1]:
# Analyzing nation-wide loan-taking trends based on the
#  Lending Club open-access dataset.
In [2]:
# our imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
# Load the raw export and keep only the loans that were fully repaid.
# After filtering, the status column is constant, so it gets dropped.
df = pd.read_csv('loans.csv', low_memory=False, header=1)
fully_paid_mask = df['loan_status'] == 'Fully Paid'
df = df.loc[fully_paid_mask].drop(columns=['loan_status'])
In [4]:
# Build a monthly period index from the loan issue date and keep only the
# three columns the analysis uses.
# Fix: set_index(..., inplace=True) on a column slice of `df` raises a
# SettingWithCopyWarning and mutates a view; chaining produces a clean,
# independent frame instead.
dt = pd.to_datetime(df['issue_d']).dt.to_period('M')
df1 = (
    df[['funded_amnt', 'addr_state', 'total_rec_int']]
    .set_index(dt)
    .sort_index(ascending=True)
)
In [5]:
# Granted applications nation-wide, sampled every second month.
# Fix: the tick labels and the per-month counts were built by two separate,
# near-identical while-loops; a single pass now produces both, which keeps
# the labels and data points aligned by construction. Row selection uses
# .loc - selecting rows with df1[str(period)] is deprecated pandas behavior.
month_earliest = df1.index.min()
month_latest = df1.index.max()

months = []            # tick labels, e.g. '2007-06'
applications_usa = []  # number of granted applications per sampled month
month = month_earliest
while month <= month_latest:
    months.append(str(month))
    applications_usa.append(len(df1.loc[str(month)].index))
    month += 2  # two-month step keeps the x-axis readable

plt.title('Granted applications nation-wide:')
plt.xlabel('Months')
plt.ylabel('Applications')
plt.xticks(np.arange(len(applications_usa)), months, rotation=60)
plt.plot(applications_usa)
plt.show()
In [6]:
# Granted loans nation-wide: sum of funded amounts per sampled month.
# Bug fix: a stray exit() after plt.show() terminated the interpreter, so
# nothing below this cell could run under Restart & Run All. Also added the
# missing x-axis label and switched to .loc row selection.
loans = []
month = month_earliest
while month <= month_latest:
    loans.append(df1.loc[str(month)]['funded_amnt'].sum())
    month += 2

plt.title('Granted loans nation-wide:')
plt.xlabel('Months')
plt.ylabel('Funds in tens of millions of dollars')
plt.xticks(np.arange(len(loans)), months, rotation=60)
plt.plot(loans)
plt.show()
In [7]:
# Trends on individual loan size: one box per year.
# Generalized: the box list and the year labels are derived from the data's
# actual year range instead of the previous hard-coded y[0]..y[4] and
# '2007'..'2011' literals, so the cell keeps working if the dataset's date
# range changes.
year_earliest = df1.index.min().year
year_latest = df1.index.max().year

yearly_funds = []
year_labels = []
for year in range(year_earliest, year_latest + 1):
    yearly_funds.append(df1.loc[str(year)]['funded_amnt'])
    year_labels.append(str(year))

plt.boxplot(yearly_funds, labels=year_labels)
plt.title('Trends on individual loans')
plt.show()
In [8]:
# 2008 and 2011 seem to carry a lot of outliers - inspect their
# distributions.
# Fix: sns.distplot is deprecated (removed in seaborn >= 0.14);
# histplot(kde=True, stat='density') is its supported equivalent. The
# duplicated plotting code is folded into a small helper.

def plot_year_distribution(frame, year):
    """Plot the density histogram (with KDE) of funded amounts for `year`.

    `frame` is expected to be period-indexed by month so that a year
    string selects all rows of that year.
    """
    sns.histplot(frame.loc[year]['funded_amnt'], kde=True, stat='density')
    plt.title(year)
    plt.xlabel('loan sums')
    plt.show()

plot_year_distribution(df1, '2008')
plot_year_distribution(df1, '2011')
In [9]:
# Finally, LC's interest income from the end of 2007 through the end of
# 2011, month to month (same bi-monthly sampling as the charts above).
# Fix: the figure had no title (it must stand alone when skimmed) and used
# deprecated df1[str(period)] row selection.
interest_monthly = []
month = month_earliest
while month <= month_latest:
    interest_monthly.append(df1.loc[str(month)]['total_rec_int'].sum())
    month += 2

plt.title('Interest received nation-wide:')
plt.xlabel('Months')
plt.ylabel('Funds')
plt.xticks(np.arange(len(interest_monthly)), months, rotation=60)
plt.plot(interest_monthly)
plt.show()
In [10]:
# Looks like the business will keep doing fine, so far!
In [11]:
# My new totally real Lending Club bosses asked me to put the core 2007-2011 data,
#  the select four columns, into a PostgreSQL database.
In [12]:
# our imports
import numpy as np
import pandas as pd

from configparser import ConfigParser

from sqlalchemy import create_engine, MetaData
from sqlalchemy import Table, Column, String, Float, Integer
from sqlalchemy import insert, inspect, select, text
from sqlalchemy import func

import matplotlib.pyplot as plt
import seaborn as sns
In [13]:
# Same preprocessing as the analysis part: keep only fully repaid loans.
df = pd.read_csv('loans.csv', low_memory=False, header=1)
df = df[df['loan_status'] == 'Fully Paid']

# Index by a monthly period for chronological sorting, then restore the
# date as a regular column.
# Fix: chaining avoids the SettingWithCopyWarning raised by
# set_index(..., inplace=True) on a column slice of `df`.
dt = pd.to_datetime(df['issue_d']).dt.to_period('M')
df1 = (
    df[['funded_amnt', 'addr_state', 'total_rec_int']]
    .set_index(dt)
    .sort_index(ascending=True)
    .reset_index()
)

# One series per future database column.
dates = df1['issue_d']
funds = df1['funded_amnt']
state = df1['addr_state']
interest = df1['total_rec_int']
In [14]:
# Records for bulk insertion: one list of dicts per table, keyed by a
# shared integer id.
# Bug fix: funded_amnt and total_rec_int target Float() columns, so they
# are kept numeric instead of being stringified; only issue_d (a Period)
# and addr_state are stored as strings. enumerate() replaces the manual
# range(size) positional indexing.
dates_dl    = [{'id': i, 'issue_d': str(d)}         for i, d in enumerate(dates)]
funds_dl    = [{'id': i, 'funded_amnt': float(f)}   for i, f in enumerate(funds)]
state_dl    = [{'id': i, 'addr_state': str(s)}      for i, s in enumerate(state)]
interest_dl = [{'id': i, 'total_rec_int': float(v)} for i, v in enumerate(interest)]
In [15]:
# Time to create the database, by running psql console with...
#   psql> CREATE DATABASE lendingclub;
In [16]:
# Build the SQLAlchemy engine from credentials kept in database.ini
# (user, password, host, port, database under the [postgresql] section),
# so no secrets are hardcoded in the notebook.
filename = 'database.ini'

parser = ConfigParser()
parser.read(filename)

# dict(...) replaces the manual key-by-key copy loop over parser.items().
db = dict(parser.items('postgresql'))

# Fix: the template was named `format`, shadowing the built-in function of
# the same name for the rest of the session.
url_template = 'postgresql://{user}:{password}@{host}:{port}/{database}'
connection_format = url_template.format(**db)

engine = create_engine(connection_format)
In [17]:
# Schema: four parallel tables sharing the same integer id.
# `id` is declared as the primary key so rows are uniquely addressable and
# the cross-table joins below run against an indexed key (the ids produced
# by enumerate() are unique, so the constraint is safe).
metadata = MetaData()
date     = Table('Date', metadata,
                Column('id', Integer(), primary_key=True),
                Column('issue_d', String(7)))
funds    = Table('Funds', metadata,
                Column('id', Integer(), primary_key=True),
                Column('funded_amnt', Float()))
state    = Table('State', metadata,
                Column('id', Integer(), primary_key=True),
                Column('addr_state', String(2)))
interest = Table('Interest', metadata,
                Column('id', Integer(), primary_key=True),
                Column('total_rec_int', Float()))
metadata.create_all(engine)

# Checking whether the database received the tables.
# Fix: Engine.table_names() is deprecated (removed in SQLAlchemy 1.4+);
# the inspector is the supported way to list tables.
inspect(engine).get_table_names()
Out[17]:
['Date', 'Funds', 'State', 'Interest']
In [18]:
# All four tables exist - load the prepared records into them.

connection = engine.connect()

table_records = [
    (date, dates_dl),
    (funds, funds_dl),
    (state, state_dl),
    (interest, interest_dl),
]
for table, records in table_records:
    result_proxy = connection.execute(insert(table), records)

# After the loop, result_proxy refers to the last (Interest) insert;
# its rowcount shows how many rows went in. Assuming for now the rest
# went just as well.
result_proxy.rowcount
Out[18]:
34116
In [19]:
# Sanity check: peek at the first 5 Date rows.
# Fixes: the old comment promised "first 10" while fetchmany(5) returned
# five; and textual SQL is wrapped in text() - passing a raw string to
# Connection.execute() is deprecated and removed in SQLAlchemy 2.0.
stmt = text('SELECT * FROM "Date"')
result_proxy = connection.execute(stmt)
results = result_proxy.fetchmany(5)
for result in results:
    print(result)
(0, '2007-06')
(1, '2007-07')
(2, '2007-07')
(3, '2007-07')
(4, '2007-07')
In [20]:
# The inserted rows look correct; close the connection so it is not left
# open while the notebook sits idle.
connection.close()
In [21]:
# Next task: find states that never really needed their own LC office.
# (In this fictional world LC started with an office in every state.)

# Reconnect and reflect the existing schema so the tables can be queried.
connection = engine.connect()
metadata = MetaData()
metadata.reflect(engine)

tables = metadata.tables
date = tables['Date']
funds = tables['Funds']
state = tables['State']
interest = tables['Interest']
In [22]:
# Bottom 10 states by number of granted applications.
# Fixes: the old comment said "bottom 5" while LIMIT 10 returned ten rows;
# and the raw SQL string is wrapped in text() - required since
# SQLAlchemy 2.0.
stmt = text("""
SELECT addr_state as state, COUNT(*) as count
FROM "State"
GROUP BY state
ORDER BY count
LIMIT 10""")
result_proxy = connection.execute(stmt)
results = result_proxy.fetchall()
for result in results:
    print(result)
('NE', 2)
('ME', 3)
('ID', 5)
('IA', 5)
('IN', 9)
('TN', 15)
('MS', 17)
('VT', 48)
('SD', 52)
('AK', 65)
In [23]:
# Hm. Below Tennessee, every state would be fine with just a phone line
# and an online application form. What about total loans granted per state?
# Fix: raw SQL wrapped in text() - required since SQLAlchemy 2.0.
stmt = text("""
SELECT s.addr_state as state, SUM(f.funded_amnt) as funds_total
FROM "Funds" as f
INNER JOIN "State" as s
on f.id = s.id
GROUP BY state
ORDER BY funds_total
""")
result_proxy = connection.execute(stmt)
results = result_proxy.fetchmany(10)
for result in results:
    print(result)
('ME', 9200.0)
('NE', 12300.0)
('IA', 56450.0)
('ID', 56750.0)
('IN', 75675.0)
('TN', 115200.0)
('MS', 124125.0)
('VT', 427600.0)
('SD', 478150.0)
('MT', 726250.0)
In [24]:
# And interest collected per state (bottom 10)?
# Fix: raw SQL wrapped in text() - required since SQLAlchemy 2.0.
stmt = text("""
SELECT s.addr_state as state, SUM(i.total_rec_int) as int_total
FROM "Interest" as i
INNER JOIN "State" as s
on s.id = i.id
GROUP BY state
ORDER BY int_total
LIMIT 10
""")
result_proxy = connection.execute(stmt)
results = result_proxy.fetchall()
for result in results:
    print(result)
('ME', 1503.11)
('NE', 2422.31)
('ID', 7769.07)
('IA', 8016.82)
('IN', 9830.17)
('TN', 15163.73)
('MS', 21886.74)
('VT', 80583.66)
('SD', 91174.09)
('MT', 158714.78)
In [25]:
# Whoa. Nothing below Vermont is even rentable, assuming a modest upkeep
# of $12,000 a year per office.
#
# With 7 offices cleared, check whether any state deserves an additional
# one, if only to compensate for those closed. Top 10 states by
# application count:
# Fixes: raw SQL wrapped in text(); the state codes are pulled straight
# from the result rows instead of a needless DataFrame round-trip.
stmt = text("""
SELECT addr_state as state, COUNT(*) as count
FROM "State"
GROUP BY state
ORDER BY count DESC
LIMIT 10""")
results = connection.execute(stmt).fetchall()

# The state code is the first column of every returned row.
state_list = [row[0] for row in results]
print(state_list)
['CA', 'NY', 'TX', 'FL', 'NJ', 'PA', 'IL', 'VA', 'GA', 'MA']
In [26]:
# Join Date and State and load the result into a DataFrame for pandas
# treatment.
# Fixes: raw SQL wrapped in text(); column names are passed at
# construction time, which replaces the separate `df.columns =
# results[0].keys()` assignment that crashed on an empty result set.
stmt = text("""
SELECT issue_d, addr_state
FROM "Date" as d
LEFT JOIN "State" as s
ON d.id = s.id
ORDER BY issue_d
""")
results = connection.execute(stmt).fetchall()

df = pd.DataFrame(results, columns=['issue_d', 'addr_state'])

df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34116 entries, 0 to 34115
Data columns (total 2 columns):
issue_d       34116 non-null object
addr_state    34116 non-null object
dtypes: object(2)
memory usage: 533.1+ KB
Out[26]:
issue_d addr_state
0 2007-06 MA
1 2007-07 MA
2 2007-07 MA
3 2007-07 WI
4 2007-07 ME
In [27]:
# Again, index by a monthly period for convenient year/month selection.
# Fix: chaining set_index avoids the SettingWithCopyWarning raised by
# inplace=True on the single-column slice.
dt = pd.to_datetime(df['issue_d']).dt.to_period('M')
df = df[['addr_state']].set_index(dt)
In [28]:
# Applications per year for each of the top-10 states.
# Fix: df[year][...] chained partial-string indexing is deprecated pandas
# behavior; .loc selects the year's rows, and summing the boolean match
# replaces the clunky .count()[0].
years = ['2007', '2008', '2009', '2010', '2011']

dictionary = {}
for state in state_list:
    dictionary[state] = [
        int((df.loc[year]['addr_state'] == state).sum()) for year in years
    ]

stat_df = pd.DataFrame(dictionary)
stat_df.index = years
stat_df
Out[28]:
CA NY TX FL NJ PA IL VA GA MA
2007 3 32 2 16 10 0 2 14 7 27
2008 251 117 102 62 59 39 50 56 49 39
2009 706 405 305 291 185 190 163 149 149 162
2010 1799 1033 716 681 476 432 383 405 365 393
2011 3215 1729 1287 1313 843 676 729 606 614 562
In [29]:
# One line per state: application counts across the five years.
for state_code, yearly_counts in stat_df.items():
    sns.lineplot(x=years, y=yearly_counts.values, label=state_code)
In [30]:
# It seems like top 4 states could use additional offices, and maybe even a couple for California.