# Gonna analyze some nationwide loan-taking trends based on the
# Lending Club open-access dataset.
# our imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Processing the initial dataset. We'll keep only the loans that were
# fully paid off.
df = pd.read_csv('loans.csv', low_memory=False, header=1)
df = df[df['loan_status']=='Fully Paid']
df = df.drop(['loan_status'], axis=1)
# Forming date column, and picking some other columns of interest...
# Also reindexing the frame with the dates.
dt = pd.to_datetime(df['issue_d']).dt.to_period('M')
df1 = df[['funded_amnt', 'addr_state', 'total_rec_int']].copy()
df1.set_index(dt, inplace=True)
df1 = df1.sort_index(axis=0, ascending=True)
# Let's take a look at the amount of granted applications nation-wide:
month_earliest = df1.index.min()
month_latest = df1.index.max()
months = []
m = month_earliest
while m <= month_latest:
    months.append(str(m))
    m += 1
month = month_earliest
applications_usa = []
while month <= month_latest:
    applications_usa.append(len(df1.loc[str(month)].index))
    month += 1
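# Aside: pandas can build the same list of month labels directly; a quick sanity
# check of that alternative (pd.period_range is standard pandas, months_alt is
# just an illustrative name):
months_alt = [str(m) for m in pd.period_range(month_earliest, month_latest, freq='M')]
assert months_alt == months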
plt.title('Granted applications nation-wide:')
plt.xlabel('Months')
plt.ylabel('Applications')
plt.xticks(np.arange(0, len(applications_usa), 1), months, rotation=60)
plt.plot(applications_usa)
plt.show()
# Granted loans nation-wide:
loans = []
month = month_earliest
while month <= month_latest:
    loans.append(df1.loc[str(month)]['funded_amnt'].sum())
    month += 1
plt.title('Granted loans nation-wide:')
plt.ylabel('Funded amount, dollars')
plt.xticks(np.arange(0, len(loans), 1), months, rotation=60)
plt.plot(loans)
plt.show()
# Trends on individual loan size:
year_earliest = df1.index.min().year
year_latest = df1.index.max().year
y = []
year = year_earliest
while year <= year_latest:
    y.append(df1.loc[str(year)]['funded_amnt'])
    year += 1
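# Aside: the same per-year series can be collected with a groupby; a quick sanity
# check of that alternative (y_alt is just an illustrative name, the boxplot below
# still uses the list built above):
y_alt = [group['funded_amnt'] for _, group in df1.groupby(df1.index.year)]
assert len(y_alt) == len(y)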
plt.boxplot(y, labels=[str(yr) for yr in range(year_earliest, year_latest + 1)])
plt.title('Trends on individual loans')
plt.show()
# 2008 and 2011 seem like they have a lot of outliers.
# Let's look at their histograms.
sns.histplot(df1.loc['2008']['funded_amnt'], kde=True)
plt.title('2008')
plt.xlabel('loan sums')
plt.show()
sns.histplot(df1.loc['2011']['funded_amnt'], kde=True)
plt.title('2011')
plt.xlabel('loan sums')
plt.show()
# Finally let's see what LC made in interest alone from the end of 2007
# through to the end of 2011, month to month:
month = month_earliest
interest_monthly = []
while month <= month_latest:
    interest_monthly.append(df1.loc[str(month)]['total_rec_int'].sum())
    month += 1
plt.xticks(np.arange(0, len(interest_monthly), 1), months, rotation=60)
plt.xlabel('Months')
plt.ylabel('Interest received, dollars')
plt.plot(interest_monthly)
plt.show()
# Looks like the business will keep doing fine, so far!
# My new totally real Lending Club bosses asked me to put the core 2007-2011 data,
# the four columns selected above, into a PostgreSQL database.
# our imports
import numpy as np
import pandas as pd
from configparser import ConfigParser
from sqlalchemy import create_engine, MetaData
from sqlalchemy import Table, Column, String, Float, Integer
from sqlalchemy import insert, select
from sqlalchemy import func
import matplotlib.pyplot as plt
import seaborn as sns
# Just like the last time, we'll select the already closed loans.
df = pd.read_csv('loans.csv', low_memory=False, header=1)
df = df[df['loan_status']=='Fully Paid']
# Convert date column to Pandas period for sorting
dt = pd.to_datetime(df['issue_d']).dt.to_period('M')
df1 = df[['funded_amnt', 'addr_state', 'total_rec_int']].copy()
df1.set_index(dt, inplace=True)
df1 = df1.sort_index(axis=0, ascending=True)
# get the date column back
df1 = df1.reset_index()
# save each column into a separate data series
dates = df1['issue_d']
funds = df1['funded_amnt']
state = df1['addr_state']
interest = df1['total_rec_int']
# Now, for later insertion into the database, I'll turn each of the series
# into a list of dictionaries:
# funded_amnt and total_rec_int are kept numeric so they match the Float columns below
dates_dl = [ {'id': i, 'issue_d': str(dates[i])} for i in range(dates.size) ]
funds_dl = [ {'id': i, 'funded_amnt': float(funds[i])} for i in range(funds.size) ]
state_dl = [ {'id': i, 'addr_state': str(state[i])} for i in range(state.size) ]
interest_dl = [ {'id': i, 'total_rec_int': float(interest[i])} for i in range(interest.size) ]
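# Quick sanity check on the shape of one of these lists; this should print something
# like {'id': 0, 'issue_d': '2007-06'} (the exact date depends on the data):
print(dates_dl[0])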
# Time to create the database, by running this in the psql console:
#   CREATE DATABASE lendingclub;
# The ini file contains the database information - user and password, host and port, and database name.
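# A minimal database.ini might look like this (all values below are placeholders,
# not real credentials):
#   [postgresql]
#   host=localhost
#   port=5432
#   database=lendingclub
#   user=postgres
#   password=secret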
filename = 'database.ini'
db = {}
parser = ConfigParser()
parser.read(filename)
section = 'postgresql'
params = parser.items(section)
for key, value in params:
    db[key] = value
url_template = 'postgresql://{user}:{password}@{host}:{port}/{database}'
connection_string = url_template.format(**db)
engine = create_engine(connection_string)
# Setting up the structure of the database, we'll make four tables:
metadata = MetaData()
date = Table('Date', metadata,
             Column('id', Integer()),
             Column('issue_d', String(7)))
funds = Table('Funds', metadata,
              Column('id', Integer()),
              Column('funded_amnt', Float()))
state = Table('State', metadata,
              Column('id', Integer()),
              Column('addr_state', String(2)))
interest = Table('Interest', metadata,
                 Column('id', Integer()),
                 Column('total_rec_int', Float()))
metadata.create_all(engine)
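# For reference, the "Funds" table above corresponds roughly to this DDL
# (a sketch; the exact SQL SQLAlchemy emits can differ slightly by version):
#   CREATE TABLE "Funds" (
#       id INTEGER,
#       funded_amnt FLOAT
#   );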
# Checking if the database has received the tables...
print(engine.table_names())
# Yup, all there.
# Now we'll insert the data into the tables.
connection = engine.connect()
insert_date = insert(date)
insert_funds = insert(funds)
insert_state = insert(state)
insert_interest = insert(interest)
result_proxy = connection.execute(insert_date, dates_dl)
result_proxy = connection.execute(insert_funds, funds_dl)
result_proxy = connection.execute(insert_state, state_dl)
result_proxy = connection.execute(insert_interest, interest_dl)
# Let's check whether the last insert went through; I'll assume for now that
# the others are just as hunky-dory...
print(result_proxy.rowcount)
# Just in case, let's take a look at the first 10 Date entries:
stmt = 'SELECT * FROM "Date"'
result_proxy = connection.execute(stmt)
results = result_proxy.fetchmany(10)
for result in results:
    print(result)
# Looks like everything's good. I'll close the connection for now.
connection.close()
# Next morning I was asked to find out whether there are states that don't really
# need an LC office of their own.
# Note that in this fictional world LC started out by opening an office in every state.
# Let's connect back to the database and reflect the tables so we can work with them:
connection = engine.connect()
metadata = MetaData()
metadata.reflect(engine)
date = metadata.tables['Date']
funds = metadata.tables['Funds']
state = metadata.tables['State']
interest = metadata.tables['Interest']
# Let's look at the bottom 10 states in terms of applications:
stmt = """
SELECT addr_state as state, COUNT(*) as count
FROM "State"
GROUP BY state
ORDER BY count
LIMIT 10"""
result_proxy = connection.execute(stmt)
results = result_proxy.fetchall()
for result in results:
    print(result)
# Hm. Looks like every state below Tennessee would be just fine with a phone line
# and an online application form on the website.
# What about loans granted per state?
stmt = """
SELECT s.addr_state as state, SUM(f.funded_amnt) as funds_total
FROM "Funds" as f
INNER JOIN "State" as s
on f.id = s.id
GROUP BY state
ORDER BY funds_total
"""
result_proxy = connection.execute(stmt)
results = result_proxy.fetchmany(10)
for result in results:
    print(result)
# And interest collected?
stmt = """
SELECT s.addr_state as state, SUM(i.total_rec_int) as int_total
FROM "Interest" as i
INNER JOIN "State" as s
on s.id = i.id
GROUP BY state
ORDER BY int_total
LIMIT 10
"""
result_proxy = connection.execute(stmt)
results = result_proxy.fetchall()
for result in results:
    print(result)
# Whoa. Assuming a modest upkeep of $12,000 a year per office, nothing below Vermont
# even covers its rent.
# So we've just cleared out 7 offices; let's see if there's any state where we should
# open additional ones, if only to compensate for those closed.
# Let's look at the top 10 states applications-wise and save the list.
stmt = """
SELECT addr_state as state, COUNT(*) as count
FROM "State"
GROUP BY state
ORDER BY count DESC
LIMIT 10"""
result_proxy = connection.execute(stmt)
results = result_proxy.fetchall()
df = pd.DataFrame(results)
state_list = list(df[0])
print(state_list)
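# Aside: the same query can be pulled straight into a frame with pandas, skipping the
# fetchall/DataFrame dance (df_counts is just an illustrative name):
df_counts = pd.read_sql(stmt, connection)
print(df_counts)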
# Let's join the Date and State tables and move the result into a dataframe for some pandas treatment.
stmt = """
SELECT issue_d, addr_state
FROM "Date" as d
LEFT JOIN "State" as s
ON d.id = s.id
ORDER BY issue_d
"""
results = connection.execute(stmt).fetchall()
df = pd.DataFrame(results)
df.columns = results[0].keys()
df.info()
print(df.head())
# Again, indexing the frame with a convenient period index.
dt = pd.to_datetime(df['issue_d']).dt.to_period('M')
df = df[['addr_state']].copy()
df.set_index(dt, inplace=True)
years = ['2007', '2008', '2009', '2010', '2011']
dictionary = {}
for state in state_list:
    entry_list = []
    for year in years:
        # count applications from this state in this year
        value = (df.loc[year]['addr_state'] == state).sum()
        entry_list.append(value)
    dictionary[state] = entry_list
stat_df = pd.DataFrame(dictionary)
stat_df.index = years
print(stat_df)
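# Aside: the same table can be built with a single groupby; a sketch of that
# alternative (counts is just an illustrative name, and the year level comes out
# as integers rather than strings here):
counts = (df.groupby([df.index.year, 'addr_state']).size()
            .unstack('addr_state', fill_value=0)
            .reindex(columns=state_list, fill_value=0))
print(counts)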
for state in stat_df.columns:
    sns.lineplot(x=years, y=stat_df[state].values, label=state)
plt.legend()
plt.show()
# It seems like the top 4 states could use additional offices, and maybe even a couple
# more for California.