In [48]:
import warnings
warnings.filterwarnings('ignore')
In [49]:
import pandas as pd
from pandas import DataFrame, Series

import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

import geopandas as gpd
from geopandas.tools import sjoin
import geoplot as gplt
from shapely.geometry import Point, Polygon

Constants

In [50]:
mean_cols = ['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean']

Load in the Dataset

In [51]:
data_df = pd.read_csv("pollution_us_2000_2016.csv", index_col=0, parse_dates=True)
data_df.sample(2)
Out[51]:
State Code County Code Site Num Address State County City Date Local NO2 Units NO2 Mean ... SO2 Units SO2 Mean SO2 1st Max Value SO2 1st Max Hour SO2 AQI CO Units CO Mean CO 1st Max Value CO 1st Max Hour CO AQI
98915 48 201 416 7421 Park Place Blvd Texas Harris Houston 2009-12-02 Parts per billion 0.073333 ... Parts per billion 0.775000 1.6 5 NaN Parts per million 0.045833 0.1 7 NaN
53634 15 3 10 2052 LAUWILIWILI ST Hawaii Honolulu Not in a city 2015-02-10 Parts per billion 2.975000 ... Parts per billion 1.416667 2.6 3 3.0 Parts per million 0.425000 0.5 0 NaN

2 rows × 28 columns

In [52]:
data_df.shape
Out[52]:
(1746661, 28)
In [53]:
# convert the date column to datetime (parse_dates=True above only parses the index column)
data_df['Date Local'] = pd.to_datetime(data_df['Date Local'])
# Unix timestamp in seconds, used for regressions later
data_df['timestamp'] = data_df['Date Local'].astype(np.int64) // 10**9
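
As a quick sanity check (a sketch, not one of the original cells), the integer timestamps should round-trip back to the original dates:

# sketch: Unix seconds -> datetime round trip via pandas
assert (pd.to_datetime(data_df['timestamp'], unit='s') == data_df['Date Local']).all()
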
In [54]:
print("Dataset Covers from",data_df['Date Local'].min(),"to", data_df['Date Local'].max())
Dataset Covers from 2000-01-01 00:00:00 to 2016-05-31 00:00:00
In [55]:
# Convert Location Codes to categorical types
data_df['State Code'] = data_df['State Code'].astype('category')
data_df['County Code'] = data_df['County Code'].astype('category')
data_df['Site Num'] = data_df['Site Num'].astype('category')

data_df['State'] = data_df['State'].astype('category')
data_df['County'] = data_df['County'].astype('category')
data_df['City'] = data_df['City'].astype('category')

"Site Num" is not a unique identifier. Group the data on the Address and Date to account for multiple measures on the same day in the same place, then reconstite the location data.

In [56]:
# grab the State, County, and City names associated with an address
address_details = data_df[['Address', 'State', 'County', 'City']].drop_duplicates()
address_details.set_index('Address', inplace=True)
In [57]:
print("Number of unique sites:",len(address_details))
Number of unique sites: 204
In [58]:
# group the data by Address and Date to get the average measures for each day
data_grouped = pd.DataFrame(data_df.groupby(['Address', 'Date Local']).mean().to_records())
In [59]:
# merge the address details and the grouped data back together
data_df = data_grouped.merge(
    address_details,
    left_on='Address',
    right_index=True)
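
A quick sanity check on the merge (a sketch): since address_details holds one row per address, merging it back should neither drop nor duplicate grouped rows.

assert len(data_df) == len(data_grouped)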
In [60]:
data_df.describe()
Out[60]:
NO2 Mean NO2 1st Max Value NO2 1st Max Hour NO2 AQI O3 Mean O3 1st Max Value O3 1st Max Hour O3 AQI SO2 Mean SO2 1st Max Value SO2 1st Max Hour SO2 AQI CO Mean CO 1st Max Value CO 1st Max Hour CO AQI timestamp
count 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412650.000000 4.128560e+05
mean 12.943202 25.557748 11.731099 24.038367 0.026092 0.039184 10.175389 35.964773 1.903953 4.564842 9.639881 7.245721 0.369380 0.626315 7.887068 6.042532 1.218458e+09
std 9.593288 16.118488 7.863331 15.274345 0.011415 0.015316 3.996333 19.623752 2.804391 7.503252 6.336021 12.034155 0.316870 0.627269 6.736911 5.942352 1.474655e+08
min -2.000000 -2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.725155 -1.400000 0.000000 0.000000 -0.420833 -0.400000 0.000000 0.000000 9.466848e+08
25% 5.816667 13.000000 5.000000 12.000000 0.017792 0.029000 9.000000 25.000000 0.254167 0.800000 4.500000 1.000000 0.185688 0.250000 3.000000 2.000000 1.093219e+09
50% 10.869565 24.000000 9.000000 23.000000 0.025875 0.038000 10.000000 33.000000 0.996875 2.000000 9.500000 3.000000 0.293478 0.450000 6.500000 5.000000 1.223942e+09
75% 17.916667 36.000000 20.000000 34.000000 0.033917 0.048000 11.000000 42.000000 2.390016 5.250000 13.500000 9.000000 0.466666 0.750000 11.000000 8.000000 1.347926e+09
max 139.541667 267.000000 23.000000 132.000000 0.095083 0.141000 23.000000 218.000000 321.612500 350.800000 23.000000 200.000000 7.151993 17.700000 23.000000 201.000000 1.464653e+09

Data Exploration

In [61]:
sns.distplot(data_df['NO2 Mean'])
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb7d497278>
In [62]:
sns.distplot(data_df['O3 Mean'])
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb0b0c7240>
In [63]:
sns.distplot(data_df['SO2 Mean'])
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb0cf6dcc0>
In [64]:
sns.distplot(data_df['CO Mean'])
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb6d5f55c0>
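
The four histogram cells above repeat one pattern; a minimal sketch (the same distplot calls, just collapsed into a loop) renders all four on a single figure:

fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for col, ax in zip(mean_cols, axes.flatten()):
    sns.distplot(data_df[col], ax=ax)  # one pollutant per axis
plt.tight_layout()
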
In [65]:
p = sns.pairplot(data_df.groupby('Date Local')[mean_cols].mean())
p.fig.suptitle("Pairplot - US Mean Airborne Pollutant Measures", y=1)
plt.savefig("pairplot.png")
In [66]:
fig, ax = plt.subplots()
fig.set_size_inches(7, 6)
p = sns.heatmap(data_df.groupby('Date Local')[mean_cols].mean().corr(), annot=True, cmap="YlGnBu")
plt.title("Correlation Matrix - US Mean Airborne Pollutant Measures", y=1)
plt.savefig("correlation_matrix.png")
In [67]:
average_measure_by_date = data_df.groupby('Date Local')[mean_cols].mean()
In [68]:
sns.jointplot(x=average_measure_by_date['NO2 Mean'], y=average_measure_by_date['CO Mean'], kind="hex")
plt.savefig("joint_NO2_CO.png")
In [69]:
sns.jointplot(x=average_measure_by_date['NO2 Mean'], y=average_measure_by_date['SO2 Mean'], kind="hex")
plt.savefig("joint_NO2_SO2.png")
In [70]:
sns.jointplot(x=average_measure_by_date['CO Mean'], y=average_measure_by_date['SO2 Mean'], kind="hex")
plt.savefig("joint_CO_SO2.png")
In [71]:
sns.jointplot(x=average_measure_by_date['CO Mean'], y=average_measure_by_date['O3 Mean'], kind="hex")
plt.savefig("joint_CO_O3.png")

Q1: Is the general trend in airborne pollutants going up or down, and which states have the highest/lowest measures in a given year? Which are making the most/least progress?

Finding the General Trend

In [72]:
data_df.groupby('Date Local')[mean_cols].mean().rolling(365).mean().describe()
Out[72]:
NO2 Mean O3 Mean SO2 Mean CO Mean
count 5632.000000 5632.000000 5632.000000 5632.000000
mean 13.515436 0.025387 1.990901 0.390788
std 2.798204 0.001099 0.906843 0.102307
min 8.954165 0.022668 0.610055 0.267495
25% 11.144620 0.024670 1.090820 0.302215
50% 12.985655 0.025278 1.974281 0.353929
75% 15.833429 0.026289 2.830212 0.484239
max 18.675477 0.027974 3.694221 0.605798
In [73]:
# get a rolling average for each pollutant;
# levels are strongly seasonal, so a 365-day window smooths out the annual cycle
x = data_df.groupby('Date Local')[['NO2 Mean', 'SO2 Mean', 'O3 Mean', 'CO Mean']].mean().rolling(365).mean()
# get the first row with non-null values - it is the same row for every pollutant, so pick any one
starting_values = x[x['NO2 Mean'].notnull()].iloc[0]

sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(11, 4)
sns.lineplot(
    data = x.dropna()/starting_values,
    ax=ax
)

plt.title("Percent Change in Average Level of NO2, SO2, O3 and CO since 2001", fontsize=14)
plt.xlabel("Year")
plt.ylabel("Percent Change")
plt.savefig("per_change_overall.png")

Finding the Top and Bottom State for Each Year

In [74]:
# annual means across all states, used by annual_diff_from_mean below
means_by_year = data_df[['State', 'Date Local']+mean_cols].groupby(data_df['Date Local'].dt.year).mean()

# get the percent difference from the annual mean for each row
def annual_diff_from_mean(row):
    year_means = means_by_year.loc[row['Date Local']]
    return (row[mean_cols]-year_means)/year_means
In [75]:
def top_bottom_state_by_year(year):
    # sorted ascending by the composite difference, so the head is the cleanest
    # state that year and the tail the most polluted
    x = by_state_year[by_state_year['Date Local'] == year].sort_values('total_per_diff')
    return x.head(1).State.values[0], x.tail(1).State.values[0]
In [76]:
by_state_year = data_df[['State', 'Date Local']+mean_cols].groupby(["State", data_df['Date Local'].dt.year]).mean()

# flatten the dataframe to make it easier to work with
# from https://stackoverflow.com/a/34262133
by_state_year = pd.DataFrame(by_state_year.to_records())

by_state_year[mean_cols] = by_state_year.apply(annual_diff_from_mean, axis=1)

by_state_year['total_per_diff'] = by_state_year[mean_cols].sum(axis=1)
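
A quick peek at the extremes of the composite score (a sketch) previews which state-years sit furthest below and above the national average:

# sketch: most-below-average (cleanest) and most-above-average state-years
cols = ['State', 'Date Local', 'total_per_diff']
print(by_state_year.sort_values('total_per_diff')[cols].head(3))
print(by_state_year.sort_values('total_per_diff')[cols].tail(3))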
In [77]:
results = []
for year in range(2000, 2017):
    x = top_bottom_state_by_year(year)
    results.append({
        "Year":year,
        "Top":x[0],
        "Bottom":x[1]
    })
results = DataFrame(results)
results.set_index('Year', inplace=True)
results.to_csv("top_and_bottom_states.csv")

Finding the States Making the Most/Least Progress

In [78]:
results = {}
for state in data_df.State.unique():
    results[state] = {}
    state_data = data_df[data_df.State == state]
    for measure in mean_cols:
        # regress each pollutant on time in days; the slope is in units per day
        x = stats.linregress(
                state_data['timestamp']//86400, 
                state_data[measure])
        results[state][measure] = x.slope
        
results = DataFrame(results).transpose()
# average the four slopes and flip the sign, scaled to change per 1,000 days,
# so that states whose pollutant levels are falling map to positive (greener) values
results['mean'] = results.mean(axis=1) * -1000
In [79]:
results.head()
Out[79]:
CO Mean NO2 Mean O3 Mean SO2 Mean mean
Virginia -0.000077 -0.002281 1.228997e-06 -0.001630 0.996557
California -0.000061 -0.001437 3.147600e-07 -0.000273 0.442694
Nevada 0.000057 0.002616 -2.288686e-06 0.000076 -0.686414
Missouri -0.000053 -0.000984 -9.198293e-07 -0.000447 0.371343
Pennsylvania -0.000024 -0.001360 -1.839940e-07 -0.000753 0.534324
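
The most and least progress falls out of the same table (a sketch): sort by the composite and take the extremes.

# sketch: highest composite = fastest decline in pollutants; lowest = worst trend
print(results['mean'].sort_values(ascending=False).head(3))  # most progress
print(results['mean'].sort_values(ascending=False).tail(3))  # least progress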
In [80]:
us_map = gpd.read_file("states_shape/states.shp")

# restrict the map to the continental US
us_map = us_map[~us_map.STATE_NAME.isin(['Hawaii', 'Alaska'])].copy()

# outer merge keeps states with no monitoring data; fillna(0) flags them
# so they can be greyed out on the map below
us_map = us_map.merge(
    results,
    how="outer",
    left_on="STATE_NAME",
    right_index=True
).fillna(0)
In [106]:
us_map[us_map['mean']==0]
Out[106]:
STATE_NAME DRAWSEQ STATE_FIPS SUB_REGION STATE_ABBR geometry CO Mean NO2 Mean O3 Mean SO2 Mean mean
2 Montana 3.0 30 Mountain MT POLYGON ((-111.4754253002074 44.70216236909688... 0.0 0.0 0.0 0.0 0.0
9 Vermont 10.0 50 New England VT POLYGON ((-73.25806026461467 42.74605836727511... 0.0 0.0 0.0 0.0 0.0
15 Nebraska 16.0 31 West North Central NE POLYGON ((-101.4073932908308 40.00100336471858... 0.0 0.0 0.0 0.0 0.0
27 District of Columbia 28.0 11 South Atlantic DC POLYGON ((-77.007930268107 38.96666736375528, ... 0.0 0.0 0.0 0.0 0.0
29 West Virginia 30.0 54 South Atlantic WV POLYGON ((-79.23166327017802 38.48049636330249... 0.0 0.0 0.0 0.0 0.0
43 Mississippi 44.0 28 East South Central MS POLYGON ((-88.45080327876401 31.43561735674144... 0.0 0.0 0.0 0.0 0.0
In [109]:
ax = us_map.plot(color='white', edgecolor='black', figsize=(12,8))
us_map.fillna(0).plot(ax=ax, column='mean', cmap='Greens', scheme='fisher_jenks', legend=True)

# mark states with no data as grey
us_map[us_map['mean']==0].plot(ax=ax, color="grey")

leg = ax.get_legend()
leg.set_bbox_to_anchor((0., 0., 0.2, 0.2))

plt.title("Continental US States 17 year Change in Airborne Pollutants (PPM/1000 Days)")
ax.set_axis_off()
plt.savefig("chloro_pol_dec.png")

Comparing California to the rest of the dataset

In [83]:
test_df = data_df.copy()
test_df['cali'] = test_df.State=="California"
test_df['cali'] = test_df['cali'].astype('category')
test_df.head(1)
Out[83]:
Address Date Local NO2 Mean NO2 1st Max Value NO2 1st Max Hour NO2 AQI O3 Mean O3 1st Max Value O3 1st Max Hour O3 AQI ... SO2 AQI CO Mean CO 1st Max Value CO 1st Max Hour CO AQI timestamp State County City cali
0 6100 ARLINGTON BLVD MONTG WARD 2000-01-01 31.0 41.0 23.0 39.0 0.00275 0.006 11.0 5.0 ... 26.0 1.244518 1.5 2.5 16.0 946684800.0 Virginia Fairfax Seven Corners False

1 rows × 23 columns

In [84]:
# sns.pairplot(test_df[['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'cali']], hue='cali')
In [85]:
# 365-day rolling averages for California and for the rest of the dataset
cali_df = data_df[data_df.State=="California"].groupby("Date Local").mean()[mean_cols].rolling(365).mean()
other_df = data_df[data_df.State!="California"].groupby("Date Local").mean()[mean_cols].rolling(365).mean()

cali_df.reset_index(inplace=True)
other_df.reset_index(inplace=True)

cali_df['ts'] = cali_df['Date Local'].astype(np.int64) // 10**9
other_df['ts'] = other_df['Date Local'].astype(np.int64) // 10**9

cali_df = cali_df[cali_df['CO Mean'].notnull()]
other_df = other_df[other_df['CO Mean'].notnull()]
In [86]:
def cali_vs_other(measure):
    sns.set()
    fig, ax = plt.subplots()
    fig.set_size_inches(11, 5)

    sns.regplot(
        x="ts", 
        y=measure, 
        data=cali_df,
        ax=ax
    )

    sns.regplot(
        x="ts", 
        y=measure, 
        data=other_df,
        ax=ax
    )
    
    ax.legend(['California', 'US Average'])

    plt.title("California Vs US Average - {}".format(measure))
    plt.xlabel("timestamp")
    plt.savefig("cali_vs_{}.png".format(measure))
    return plt
In [87]:
cali_vs_other('NO2 Mean')
Out[87]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [88]:
cali_vs_other('O3 Mean')
Out[88]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [89]:
cali_vs_other('SO2 Mean')
Out[89]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [90]:
cali_vs_other('CO Mean')
Out[90]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [91]:
stats.linregress(cali_df['ts'], cali_df["CO Mean"])
Out[91]:
LinregressResult(slope=-7.237765088306528e-10, intercept=1.3227338118980763, rvalue=-0.9460100801922454, pvalue=0.0, stderr=3.3142279043338804e-12)
In [92]:
stats.linregress(other_df['ts'], other_df["CO Mean"])
Out[92]:
LinregressResult(slope=-6.083337587817324e-10, intercept=1.101237646519738, rvalue=-0.9665835464465063, pvalue=0.0, stderr=2.150228932931262e-12)
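
Those slopes are in ppm per second of Unix time, which is hard to read; a rough conversion to ppm per decade (a sketch, using the slopes printed above) makes the comparison concrete:

seconds_per_decade = 10 * 365.25 * 86400
print(-7.2378e-10 * seconds_per_decade)  # California: roughly -0.23 ppm CO per decade
print(-6.0833e-10 * seconds_per_decade)  # rest of the US: roughly -0.19 ppm CO per decade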

Compare NO2 and SO2 measures pre- and post-implementation of both the Clean Air Interstate Rule of 2005 and the Cross-State Air Pollution Rule of 2012

In [93]:
jan_1_2001_ts = 978307200
jan_1_2005_ts = 1104537600
jan_1_2012_ts = 1325376000
rolling_span = 365
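
These constants can be derived instead of hard-coded (a sketch); pandas treats a tz-naive Timestamp as UTC in .timestamp(), which matches the values above:

assert int(pd.Timestamp("2001-01-01").timestamp()) == jan_1_2001_ts
assert int(pd.Timestamp("2005-01-01").timestamp()) == jan_1_2005_ts
assert int(pd.Timestamp("2012-01-01").timestamp()) == jan_1_2012_ts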
In [94]:
cair_states = ['Alabama','Arkansas','Georgia','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
'Maryland','Michigan','Minnesota','Mississippi','Missouri','Nebraska','New Jersey','New York','North Carolina',
'Ohio','Oklahoma','Pennsylvania','South Carolina','Tennessee','Texas','Virginia','West Virginia','Wisconsin']
In [95]:
pre_CAIR = data_df[
    (data_df['Date Local'].dt.year >= 2001) & (data_df['Date Local'].dt.year < 2005) &
    (data_df.State.isin(cair_states))
].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
pre_CAIR.reset_index(inplace=True)
pre_CAIR['days_after'] = ((pre_CAIR['Date Local'].astype(np.int64) // 10**9) - jan_1_2001_ts)//86400
pre_CAIR = pre_CAIR[pre_CAIR['NO2 Mean'].notnull()]

pre_CAIR['NO2 Mean'] = pre_CAIR['NO2 Mean']/pre_CAIR.iloc[0]['NO2 Mean']
pre_CAIR['SO2 Mean'] = pre_CAIR['SO2 Mean']/pre_CAIR.iloc[0]['SO2 Mean']
In [96]:
post_CAIR = data_df[
    (data_df['Date Local'].dt.year >= 2005) &(data_df['Date Local'].dt.year < 2009) &
    (data_df.State.isin(cair_states))
].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
post_CAIR.reset_index(inplace=True)
post_CAIR['days_after'] = ((post_CAIR['Date Local'].astype(np.int64) // 10**9) - jan_1_2005_ts)//86400
post_CAIR = post_CAIR[post_CAIR['NO2 Mean'].notnull()]

post_CAIR['NO2 Mean'] = post_CAIR['NO2 Mean']/post_CAIR.iloc[0]['NO2 Mean']
post_CAIR['SO2 Mean'] = post_CAIR['SO2 Mean']/post_CAIR.iloc[0]['SO2 Mean']
In [97]:
post_CSAPR = data_df[
    (data_df['Date Local'].dt.year >= 2012) & (data_df['Date Local'].dt.year < 2016) &
    (data_df.State.isin(cair_states))
].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
post_CSAPR.reset_index(inplace=True)
post_CSAPR['days_after'] = ((post_CSAPR['Date Local'].astype(np.int64) // 10**9) - jan_1_2012_ts)//86400
post_CSAPR = post_CSAPR[post_CSAPR['NO2 Mean'].notnull()]

post_CSAPR['NO2 Mean'] = post_CSAPR['NO2 Mean']/post_CSAPR.iloc[0]['NO2 Mean']
post_CSAPR['SO2 Mean'] = post_CSAPR['SO2 Mean']/post_CSAPR.iloc[0]['SO2 Mean']
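
The three window cells above repeat one pattern; a minimal refactor sketch (the helper name pollution_window is hypothetical, not from the original notebook) captures it once:

def pollution_window(start_year, end_year, epoch_ts):
    # filter CAIR states to the year window, average per day, smooth over
    # rolling_span days, and normalize both pollutants to the first valid value
    w = data_df[
        (data_df['Date Local'].dt.year >= start_year) &
        (data_df['Date Local'].dt.year < end_year) &
        (data_df.State.isin(cair_states))
    ].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
    w.reset_index(inplace=True)
    w['days_after'] = ((w['Date Local'].astype(np.int64) // 10**9) - epoch_ts) // 86400
    w = w[w['NO2 Mean'].notnull()]
    for col in ['NO2 Mean', 'SO2 Mean']:
        w[col] = w[col] / w.iloc[0][col]
    return w

# e.g. pre_CAIR = pollution_window(2001, 2005, jan_1_2001_ts)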
In [98]:
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)

sns.regplot(
    x='days_after',
    y='NO2 Mean',
    data=pre_CAIR
)

sns.regplot(
    x='days_after',
    y='NO2 Mean',
    data=post_CAIR
)

sns.regplot(
    x='days_after',
    y='NO2 Mean',
    data=post_CSAPR
)


plt.ylim(0, 1.2) 

ax.legend(['2001-2005 (Pre-CAIR)','2005-2009 (CAIR)', '2012-2016 (CSAPR)'])
plt.title("Percent Change in NO2 Measures in CAIR/CSAPR States Pre/Post Implementation (365 day rolling average)")
plt.xlabel("Days")
plt.ylabel("Percent Change")

plt.savefig("cair_csapr_no2.png")
In [99]:
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)

sns.regplot(
    x='days_after',
    y='SO2 Mean',
    data=pre_CAIR
)

sns.regplot(
    x='days_after',
    y='SO2 Mean',
    data=post_CAIR
)

sns.regplot(
    x='days_after',
    y='SO2 Mean',
    data=post_CSAPR
)


plt.ylim(0, 1.2) 

ax.legend(['2001-2005 (Pre-CAIR)','2005-2009 (CAIR)', '2012-2016 (CSAPR)'])
plt.title("Percent Change in SO2 Measures in CAIR/CSAPR States Pre/Post Implementation (365 day rolling average)")
plt.xlabel("Days")
plt.ylabel("Percent Change")

plt.savefig("cair_csapr_so2.png")
In [100]:
stats.linregress(pre_CAIR['days_after'], pre_CAIR["SO2 Mean"]).slope
Out[100]:
-1.1589459141071498e-07
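
Running the same regression over all three windows (a sketch) makes the pre/post comparison explicit:

# sketch: SO2 slope (change in normalized level per day) for each window
for name, df in [('pre-CAIR', pre_CAIR), ('CAIR', post_CAIR), ('CSAPR', post_CSAPR)]:
    print(name, stats.linregress(df['days_after'], df['SO2 Mean']).slope)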

Make the tiny dataset for the submission

In [101]:
data_df = pd.read_csv("pollution_us_2000_2016.csv", index_col=0, parse_dates=True)
In [102]:
data_df.head(1000).to_csv("pollution_us_2000_2016_sample.csv")
In [103]:
DataFrame(data_df.groupby("State").count()['Address'])
Out[103]:
Address
State
Alabama 3126
Alaska 1974
Arizona 69840
Arkansas 35332
California 576142
Colorado 35188
Connecticut 29933
Country Of Mexico 9506
Delaware 3630
District Of Columbia 25696
Florida 25918
Georgia 7722
Hawaii 20276
Idaho 1828
Illinois 50116
Indiana 13926
Iowa 25850
Kansas 31480
Kentucky 14686
Louisiana 23874
Maine 23623
Maryland 23538
Massachusetts 21572
Michigan 8182
Minnesota 3558
Missouri 19778
Nevada 9698
New Hampshire 9294
New Jersey 26732
New Mexico 7130
New York 70487
North Carolina 37126
North Dakota 11018
Ohio 22934
Oklahoma 34420
Oregon 11794
Pennsylvania 188892
Rhode Island 6324
South Carolina 6536
South Dakota 8316
Tennessee 5842
Texas 123208
Utah 8668
Virginia 36422
Washington 962
Wisconsin 1516
Wyoming 13048