In [48]:
import warnings
warnings.filterwarnings('ignore')
In [49]:
import pandas as pd
from pandas import DataFrame, Series

import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

import geopandas as gpd
from geopandas.tools import sjoin
import geoplot as gplt
from shapely.geometry import Point, Polygon

Constants

In [50]:
mean_cols = ['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean']

Load in the Dataset

In [51]:
data_df = pd.read_csv("pollution_us_2000_2016.csv", index_col=0, parse_dates=True)
data_df.sample(2)
Out[51]:
State Code County Code Site Num Address State County City Date Local NO2 Units NO2 Mean ... SO2 Units SO2 Mean SO2 1st Max Value SO2 1st Max Hour SO2 AQI CO Units CO Mean CO 1st Max Value CO 1st Max Hour CO AQI
98915 48 201 416 7421 Park Place Blvd Texas Harris Houston 2009-12-02 Parts per billion 0.073333 ... Parts per billion 0.775000 1.6 5 NaN Parts per million 0.045833 0.1 7 NaN
53634 15 3 10 2052 LAUWILIWILI ST Hawaii Honolulu Not in a city 2015-02-10 Parts per billion 2.975000 ... Parts per billion 1.416667 2.6 3 3.0 Parts per million 0.425000 0.5 0 NaN

2 rows × 28 columns

In [52]:
data_df.shape
Out[52]:
(1746661, 28)
In [53]:
# convert the date column to datetime (parse_dates=True above only parses the index column)
data_df['Date Local'] = pd.to_datetime(data_df['Date Local'])
# Unix timestamp in seconds, used for regressions later
data_df['timestamp'] = data_df['Date Local'].astype(np.int64) // 10**9
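
As a quick sanity check (a sketch, not one of the original cells), the integer timestamps should round-trip back to the original dates:

# sketch: Unix seconds -> datetime round trip via pandas
assert (pd.to_datetime(data_df['timestamp'], unit='s') == data_df['Date Local']).all()
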
In [54]:
print("Dataset Covers from",data_df['Date Local'].min(),"to", data_df['Date Local'].max())
Dataset Covers from 2000-01-01 00:00:00 to 2016-05-31 00:00:00
In [55]:
# Convert Location Codes to categorical types
data_df['State Code'] = data_df['State Code'].astype('category')
data_df['County Code'] = data_df['County Code'].astype('category')
data_df['Site Num'] = data_df['Site Num'].astype('category')

data_df['State'] = data_df['State'].astype('category')
data_df['County'] = data_df['County'].astype('category')
data_df['City'] = data_df['City'].astype('category')

"Site Num" is not a unique identifier. Group the data on the Address and Date to account for multiple measures on the same day in the same place, then reconstite the location data.

In [56]:
# grab the State, County, and City names associated with an address
address_details = data_df[['Address', 'State', 'County', 'City']].drop_duplicates()
address_details.set_index('Address', inplace=True)
In [57]:
print("Number of unique sites:",len(address_details))
Number of unique sites: 204
In [58]:
# group the data by Address and Date to get the average measures for each day
data_grouped = pd.DataFrame(data_df.groupby(['Address', 'Date Local']).mean().to_records())
In [59]:
# merge the address details and the grouped data back together
data_df = data_grouped.merge(
    address_details,
    left_on='Address',
    right_index=True)
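
A quick sanity check on the merge (a sketch): since address_details holds one row per address, merging it back should neither drop nor duplicate grouped rows.

assert len(data_df) == len(data_grouped)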
In [60]:
data_df.describe()
Out[60]:
NO2 Mean NO2 1st Max Value NO2 1st Max Hour NO2 AQI O3 Mean O3 1st Max Value O3 1st Max Hour O3 AQI SO2 Mean SO2 1st Max Value SO2 1st Max Hour SO2 AQI CO Mean CO 1st Max Value CO 1st Max Hour CO AQI timestamp
count 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412856.000000 412650.000000 4.128560e+05
mean 12.943202 25.557748 11.731099 24.038367 0.026092 0.039184 10.175389 35.964773 1.903953 4.564842 9.639881 7.245721 0.369380 0.626315 7.887068 6.042532 1.218458e+09
std 9.593288 16.118488 7.863331 15.274345 0.011415 0.015316 3.996333 19.623752 2.804391 7.503252 6.336021 12.034155 0.316870 0.627269 6.736911 5.942352 1.474655e+08
min -2.000000 -2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.725155 -1.400000 0.000000 0.000000 -0.420833 -0.400000 0.000000 0.000000 9.466848e+08
25% 5.816667 13.000000 5.000000 12.000000 0.017792 0.029000 9.000000 25.000000 0.254167 0.800000 4.500000 1.000000 0.185688 0.250000 3.000000 2.000000 1.093219e+09
50% 10.869565 24.000000 9.000000 23.000000 0.025875 0.038000 10.000000 33.000000 0.996875 2.000000 9.500000 3.000000 0.293478 0.450000 6.500000 5.000000 1.223942e+09
75% 17.916667 36.000000 20.000000 34.000000 0.033917 0.048000 11.000000 42.000000 2.390016 5.250000 13.500000 9.000000 0.466666 0.750000 11.000000 8.000000 1.347926e+09
max 139.541667 267.000000 23.000000 132.000000 0.095083 0.141000 23.000000 218.000000 321.612500 350.800000 23.000000 200.000000 7.151993 17.700000 23.000000 201.000000 1.464653e+09

Data Exploration

In [61]:
sns.distplot(data_df['NO2 Mean'])
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb7d497278>
In [62]:
sns.distplot(data_df['O3 Mean'])
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb0b0c7240>
In [63]:
sns.distplot(data_df['SO2 Mean'])
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb0cf6dcc0>
In [64]:
sns.distplot(data_df['CO Mean'])
Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x1eb6d5f55c0>
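
The four histogram cells above repeat one pattern; a minimal sketch (the same distplot calls, just collapsed into a loop) renders all four on a single figure:

fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for col, ax in zip(mean_cols, axes.flatten()):
    sns.distplot(data_df[col], ax=ax)  # one pollutant per axis
plt.tight_layout()
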
In [65]:
p = sns.pairplot(data_df.groupby('Date Local')[mean_cols].mean())
p.fig.suptitle("Pairplot - US Mean Airborne Pollutant Measures", y=1)
plt.savefig("pairplot.png")
In [66]:
fig, ax = plt.subplots()
fig.set_size_inches(7, 6)
p = sns.heatmap(data_df.groupby('Date Local')[mean_cols].mean().corr(), annot=True, cmap="YlGnBu")
plt.title("Correlation Matrix - US Mean Airborne Pollutant Measures", y=1)
plt.savefig("correlation_matrix.png")
In [67]:
average_measure_by_date = data_df.groupby('Date Local')[mean_cols].mean()
In [68]:
sns.jointplot(x=average_measure_by_date['NO2 Mean'], y=average_measure_by_date['CO Mean'], kind="hex")
plt.savefig("joint_NO2_CO.png")
In [69]:
sns.jointplot(x=average_measure_by_date['NO2 Mean'], y=average_measure_by_date['SO2 Mean'], kind="hex")
plt.savefig("joint_NO2_SO2.png")
In [70]:
sns.jointplot(x=average_measure_by_date['CO Mean'], y=average_measure_by_date['SO2 Mean'], kind="hex")
plt.savefig("joint_CO_SO2.png")
In [71]:
sns.jointplot(x=average_measure_by_date['CO Mean'], y=average_measure_by_date['O3 Mean'], kind="hex")
plt.savefig("joint_CO_O3.png")

Q1: Is the general trend in airborne pollutants going up or down, and which states have the highest/lowest measures in a given year? Which are making the most/least progress?

Finding the General Trend

In [72]:
data_df.groupby('Date Local')[mean_cols].mean().rolling(365).mean().describe()
Out[72]:
NO2 Mean O3 Mean SO2 Mean CO Mean
count 5632.000000 5632.000000 5632.000000 5632.000000
mean 13.515436 0.025387 1.990901 0.390788
std 2.798204 0.001099 0.906843 0.102307
min 8.954165 0.022668 0.610055 0.267495
25% 11.144620 0.024670 1.090820 0.302215
50% 12.985655 0.025278 1.974281 0.353929
75% 15.833429 0.026289 2.830212 0.484239
max 18.675477 0.027974 3.694221 0.605798
In [73]:
# get a rolling average for each pollutant;
# levels are strongly seasonal, so a 365-day window smooths out the annual cycle
x = data_df.groupby('Date Local')[['NO2 Mean', 'SO2 Mean', 'O3 Mean', 'CO Mean']].mean().rolling(365).mean()
# get the first row with non-null values - it is the same row for every pollutant, so pick any one
starting_values = x[x['NO2 Mean'].notnull()].iloc[0]

sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(11, 4)
sns.lineplot(
    data = x.dropna()/starting_values,
    ax=ax
)

plt.title("Percent Change in Average Level of NO2, SO2, O3 and CO since 2001", fontsize=14)
plt.xlabel("Year")
plt.ylabel("Percent Change")
plt.savefig("per_change_overall.png")

Finding the Top and Bottom State for Each Year

In [74]:
# annual means across all states, used by annual_diff_from_mean below
means_by_year = data_df[['State', 'Date Local']+mean_cols].groupby(data_df['Date Local'].dt.year).mean()

# get the percent difference from the annual mean for each row
def annual_diff_from_mean(row):
    year_means = means_by_year.loc[row['Date Local']]
    return (row[mean_cols]-year_means)/year_means
In [75]:
def top_bottom_state_by_year(year):
    # sorted ascending by the composite difference, so the head is the cleanest
    # state that year and the tail the most polluted
    x = by_state_year[by_state_year['Date Local'] == year].sort_values('total_per_diff')
    return x.head(1).State.values[0], x.tail(1).State.values[0]
In [76]:
by_state_year = data_df[['State', 'Date Local']+mean_cols].groupby(["State", data_df['Date Local'].dt.year]).mean()

# flatten the dataframe to make it easier to work with
# from https://stackoverflow.com/a/34262133
by_state_year = pd.DataFrame(by_state_year.to_records())

by_state_year[mean_cols] = by_state_year.apply(annual_diff_from_mean, axis=1)

by_state_year['total_per_diff'] = by_state_year[mean_cols].sum(axis=1)
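
A quick peek at the extremes of the composite score (a sketch) previews which state-years sit furthest below and above the national average:

# sketch: most-below-average (cleanest) and most-above-average state-years
cols = ['State', 'Date Local', 'total_per_diff']
print(by_state_year.sort_values('total_per_diff')[cols].head(3))
print(by_state_year.sort_values('total_per_diff')[cols].tail(3))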
In [77]:
results = []
for year in range(2000, 2017):
    x = top_bottom_state_by_year(year)
    results.append({
        "Year":year,
        "Top":x[0],
        "Bottom":x[1]
    })
results = DataFrame(results)
results.set_index('Year', inplace=True)
results.to_csv("top_and_bottom_states.csv")

Finding the States Making the Most/Least Progress

In [78]:
results = {}
for state in data_df.State.unique():
    results[state] = {}
    state_data = data_df[data_df.State == state]
    for measure in mean_cols:
        # regress each pollutant on time in days; the slope is in units per day
        x = stats.linregress(
                state_data['timestamp']//86400, 
                state_data[measure])
        results[state][measure] = x.slope
        
results = DataFrame(results).transpose()
# average the four slopes and flip the sign, scaled to change per 1,000 days,
# so that states whose pollutant levels are falling map to positive (greener) values
results['mean'] = results.mean(axis=1) * -1000
In [79]:
results.head()
Out[79]:
CO Mean NO2 Mean O3 Mean SO2 Mean mean
Virginia -0.000077 -0.002281 1.228997e-06 -0.001630 0.996557
California -0.000061 -0.001437 3.147600e-07 -0.000273 0.442694
Nevada 0.000057 0.002616 -2.288686e-06 0.000076 -0.686414
Missouri -0.000053 -0.000984 -9.198293e-07 -0.000447 0.371343
Pennsylvania -0.000024 -0.001360 -1.839940e-07 -0.000753 0.534324
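
The most and least progress falls out of the same table (a sketch): sort by the composite and take the extremes.

# sketch: highest composite = fastest decline in pollutants; lowest = worst trend
print(results['mean'].sort_values(ascending=False).head(3))  # most progress
print(results['mean'].sort_values(ascending=False).tail(3))  # least progress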
In [80]:
us_map = gpd.read_file("states_shape/states.shp")

# restrict the map to the continental US
us_map = us_map[~us_map.STATE_NAME.isin(['Hawaii', 'Alaska'])].copy()

# outer merge keeps states with no monitoring data; fillna(0) flags them
# so they can be greyed out on the map below
us_map = us_map.merge(
    results,
    how="outer",
    left_on="STATE_NAME",
    right_index=True
).fillna(0)
In [106]:
us_map[us_map['mean']==0]
Out[106]:
STATE_NAME DRAWSEQ STATE_FIPS SUB_REGION STATE_ABBR geometry CO Mean NO2 Mean O3 Mean SO2 Mean mean
2 Montana 3.0 30 Mountain MT POLYGON ((-111.4754253002074 44.70216236909688... 0.0 0.0 0.0 0.0 0.0
9 Vermont 10.0 50 New England VT POLYGON ((-73.25806026461467 42.74605836727511... 0.0 0.0 0.0 0.0 0.0
15 Nebraska 16.0 31 West North Central NE POLYGON ((-101.4073932908308 40.00100336471858... 0.0 0.0 0.0 0.0 0.0
27 District of Columbia 28.0 11 South Atlantic DC POLYGON ((-77.007930268107 38.96666736375528, ... 0.0 0.0 0.0 0.0 0.0
29 West Virginia 30.0 54 South Atlantic WV POLYGON ((-79.23166327017802 38.48049636330249... 0.0 0.0 0.0 0.0 0.0
43 Mississippi 44.0 28 East South Central MS POLYGON ((-88.45080327876401 31.43561735674144... 0.0 0.0 0.0 0.0 0.0
In [109]:
ax = us_map.plot(color='white', edgecolor='black', figsize=(12,8))
us_map.fillna(0).plot(ax=ax, column='mean', cmap='Greens', scheme='fisher_jenks', legend=True)

# mark states with no data as grey
us_map[us_map['mean']==0].plot(ax=ax, color="grey")

leg = ax.get_legend()
leg.set_bbox_to_anchor((0., 0., 0.2, 0.2))

plt.title("Continental US States 17 year Change in Airborne Pollutants (PPM/1000 Days)")
ax.set_axis_off()
plt.savefig("chloro_pol_dec.png")

Comparing California to the rest of the dataset

In [83]:
test_df = data_df.copy()
test_df['cali'] = test_df.State=="California"
test_df['cali'] = test_df['cali'].astype('category')
test_df.head(1)
Out[83]:
Address Date Local NO2 Mean NO2 1st Max Value NO2 1st Max Hour NO2 AQI O3 Mean O3 1st Max Value O3 1st Max Hour O3 AQI ... SO2 AQI CO Mean CO 1st Max Value CO 1st Max Hour CO AQI timestamp State County City cali
0 6100 ARLINGTON BLVD MONTG WARD 2000-01-01 31.0 41.0 23.0 39.0 0.00275 0.006 11.0 5.0 ... 26.0 1.244518 1.5 2.5 16.0 946684800.0 Virginia Fairfax Seven Corners False

1 rows × 23 columns

In [84]:
# sns.pairplot(test_df[['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'cali']], hue='cali')
In [85]:
# 365-day rolling averages for California and for the rest of the dataset
cali_df = data_df[data_df.State=="California"].groupby("Date Local").mean()[mean_cols].rolling(365).mean()
other_df = data_df[data_df.State!="California"].groupby("Date Local").mean()[mean_cols].rolling(365).mean()

cali_df.reset_index(inplace=True)
other_df.reset_index(inplace=True)

cali_df['ts'] = cali_df['Date Local'].astype(np.int64) // 10**9
other_df['ts'] = other_df['Date Local'].astype(np.int64) // 10**9

cali_df = cali_df[cali_df['CO Mean'].notnull()]
other_df = other_df[other_df['CO Mean'].notnull()]
In [86]:
def cali_vs_other(measure):
    sns.set()
    fig, ax = plt.subplots()
    fig.set_size_inches(11, 5)

    sns.regplot(
        x="ts", 
        y=measure, 
        data=cali_df,
        ax=ax
    )

    sns.regplot(
        x="ts", 
        y=measure, 
        data=other_df,
        ax=ax
    )
    
    ax.legend(['California', 'US Average'])

    plt.title("California Vs US Average - {}".format(measure))
    plt.xlabel("timestamp")
    plt.savefig("cali_vs_{}.png".format(measure))
    return plt
In [87]:
cali_vs_other('NO2 Mean')
Out[87]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [88]:
cali_vs_other('O3 Mean')
Out[88]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [89]:
cali_vs_other('SO2 Mean')
Out[89]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [90]:
cali_vs_other('CO Mean')
Out[90]:
<module 'matplotlib.pyplot' from 'F:\\Programs\\Anaconda\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [91]:
stats.linregress(cali_df['ts'], cali_df["CO Mean"])
Out[91]:
LinregressResult(slope=-7.237765088306528e-10, intercept=1.3227338118980763, rvalue=-0.9460100801922454, pvalue=0.0, stderr=3.3142279043338804e-12)
In [92]:
stats.linregress(other_df['ts'], other_df["CO Mean"])
Out[92]:
LinregressResult(slope=-6.083337587817324e-10, intercept=1.101237646519738, rvalue=-0.9665835464465063, pvalue=0.0, stderr=2.150228932931262e-12)
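
Those slopes are in ppm per second of Unix time, which is hard to read; a rough conversion to ppm per decade (a sketch, using the slopes printed above) makes the comparison concrete:

seconds_per_decade = 10 * 365.25 * 86400
print(-7.2378e-10 * seconds_per_decade)  # California: roughly -0.23 ppm CO per decade
print(-6.0833e-10 * seconds_per_decade)  # rest of the US: roughly -0.19 ppm CO per decade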

Compare NO2 and SO2 measures pre- and post-implementation of both the Clean Air Interstate Rule of 2005 and the Cross-State Air Pollution Rule of 2012

In [93]:
jan_1_2001_ts = 978307200
jan_1_2005_ts = 1104537600
jan_1_2012_ts = 1325376000
rolling_span = 365
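
These constants can be derived instead of hard-coded (a sketch); pandas treats a tz-naive Timestamp as UTC in .timestamp(), which matches the values above:

assert int(pd.Timestamp("2001-01-01").timestamp()) == jan_1_2001_ts
assert int(pd.Timestamp("2005-01-01").timestamp()) == jan_1_2005_ts
assert int(pd.Timestamp("2012-01-01").timestamp()) == jan_1_2012_ts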
In [94]:
cair_states = ['Alabama','Arkansas','Georgia','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
'Maryland','Michigan','Minnesota','Mississippi','Missouri','Nebraska','New Jersey','New York','North Carolina',
'Ohio','Oklahoma','Pennsylvania','South Carolina','Tennessee','Texas','Virginia','West Virginia','Wisconsin']
In [95]:
pre_CAIR = data_df[
    (data_df['Date Local'].dt.year >= 2001) & (data_df['Date Local'].dt.year < 2005) &
    (data_df.State.isin(cair_states))
].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
pre_CAIR.reset_index(inplace=True)
pre_CAIR['days_after'] = ((pre_CAIR['Date Local'].astype(np.int64) // 10**9) - jan_1_2001_ts)//86400
pre_CAIR = pre_CAIR[pre_CAIR['NO2 Mean'].notnull()]

pre_CAIR['NO2 Mean'] = pre_CAIR['NO2 Mean']/pre_CAIR.iloc[0]['NO2 Mean']
pre_CAIR['SO2 Mean'] = pre_CAIR['SO2 Mean']/pre_CAIR.iloc[0]['SO2 Mean']
In [96]:
post_CAIR = data_df[
    (data_df['Date Local'].dt.year >= 2005) &(data_df['Date Local'].dt.year < 2009) &
    (data_df.State.isin(cair_states))
].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
post_CAIR.reset_index(inplace=True)
post_CAIR['days_after'] = ((post_CAIR['Date Local'].astype(np.int64) // 10**9) - jan_1_2005_ts)//86400
post_CAIR = post_CAIR[post_CAIR['NO2 Mean'].notnull()]

post_CAIR['NO2 Mean'] = post_CAIR['NO2 Mean']/post_CAIR.iloc[0]['NO2 Mean']
post_CAIR['SO2 Mean'] = post_CAIR['SO2 Mean']/post_CAIR.iloc[0]['SO2 Mean']
In [97]:
post_CSAPR = data_df[
    (data_df['Date Local'].dt.year >= 2012) & (data_df['Date Local'].dt.year < 2016) &
    (data_df.State.isin(cair_states))
].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
post_CSAPR.reset_index(inplace=True)
post_CSAPR['days_after'] = ((post_CSAPR['Date Local'].astype(np.int64) // 10**9) - jan_1_2012_ts)//86400
post_CSAPR = post_CSAPR[post_CSAPR['NO2 Mean'].notnull()]

post_CSAPR['NO2 Mean'] = post_CSAPR['NO2 Mean']/post_CSAPR.iloc[0]['NO2 Mean']
post_CSAPR['SO2 Mean'] = post_CSAPR['SO2 Mean']/post_CSAPR.iloc[0]['SO2 Mean']
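
The three window cells above repeat one pattern; a minimal refactor sketch (the helper name pollution_window is hypothetical, not from the original notebook) captures it once:

def pollution_window(start_year, end_year, epoch_ts):
    # filter CAIR states to the year window, average per day, smooth over
    # rolling_span days, and normalize both pollutants to the first valid value
    w = data_df[
        (data_df['Date Local'].dt.year >= start_year) &
        (data_df['Date Local'].dt.year < end_year) &
        (data_df.State.isin(cair_states))
    ].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
    w.reset_index(inplace=True)
    w['days_after'] = ((w['Date Local'].astype(np.int64) // 10**9) - epoch_ts) // 86400
    w = w[w['NO2 Mean'].notnull()]
    for col in ['NO2 Mean', 'SO2 Mean']:
        w[col] = w[col] / w.iloc[0][col]
    return w

# e.g. pre_CAIR = pollution_window(2001, 2005, jan_1_2001_ts)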
In [98]:
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)

sns.regplot(
    x='days_after',
    y='NO2 Mean',
    data=pre_CAIR
)

sns.regplot(
    x='days_after',
    y='NO2 Mean',
    data=post_CAIR
)

sns.regplot(
    x='days_after',
    y='NO2 Mean',
    data=post_CSAPR
)


plt.ylim(0, 1.2) 

ax.legend(['2001-2005 (Pre-CAIR)','2005-2009 (CAIR)', '2012-2016 (CSAPR)'])
plt.title("Percent Change in NO2 Measures in CAIR/CSAPR States Pre/Post Implementation (365 day rolling average)")
plt.xlabel("Days")
plt.ylabel("Percent Change")

plt.savefig("cair_csapr_no2.png")
In [99]:
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)

sns.regplot(
    x='days_after',
    y='SO2 Mean',
    data=pre_CAIR
)

sns.regplot(
    x='days_after',
    y='SO2 Mean',
    data=post_CAIR
)

sns.regplot(
    x='days_after',
    y='SO2 Mean',
    data=post_CSAPR
)


plt.ylim(0, 1.2) 

ax.legend(['2001-2005 (Pre-CAIR)','2005-2009 (CAIR)', '2012-2016 (CSAPR)'])
plt.title("Percent Change in SO2 Measures in CAIR/CSAPR States Pre/Post Implementation (365 day rolling average)")
plt.xlabel("Days")
plt.ylabel("Percent Change")

plt.savefig("cair_csapr_so2.png")
In [100]:
stats.linregress(pre_CAIR['days_after'], pre_CAIR["SO2 Mean"]).slope
Out[100]:
-1.1589459141071498e-07
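
Running the same regression over all three windows (a sketch) makes the pre/post comparison explicit:

# sketch: SO2 slope (change in normalized level per day) for each window
for name, df in [('pre-CAIR', pre_CAIR), ('CAIR', post_CAIR), ('CSAPR', post_CSAPR)]:
    print(name, stats.linregress(df['days_after'], df['SO2 Mean']).slope)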

Make the tiny dataset for the submission

In [101]:
data_df = pd.read_csv("pollution_us_2000_2016.csv", index_col=0, parse_dates=True)
In [102]:
data_df.head(1000).to_csv("pollution_us_2000_2016_sample.csv")
In [103]:
DataFrame(data_df.groupby("State").count()['Address'])
Out[103]:
Address
State
Alabama 3126
Alaska 1974
Arizona 69840
Arkansas 35332
California 576142
Colorado 35188
Connecticut 29933
Country Of Mexico 9506
Delaware 3630
District Of Columbia 25696
Florida 25918
Georgia 7722
Hawaii 20276
Idaho 1828
Illinois 50116
Indiana 13926
Iowa 25850
Kansas 31480
Kentucky 14686
Louisiana 23874
Maine 23623
Maryland 23538
Massachusetts 21572
Michigan 8182
Minnesota 3558
Missouri 19778
Nevada 9698
New Hampshire 9294
New Jersey 26732
New Mexico 7130
New York 70487
North Carolina 37126
North Dakota 11018
Ohio 22934
Oklahoma 34420
Oregon 11794
Pennsylvania 188892
Rhode Island 6324
South Carolina 6536
South Dakota 8316
Tennessee 5842
Texas 123208
Utah 8668
Virginia 36422
Washington 962
Wisconsin 1516
Wyoming 13048