import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from geopandas.tools import sjoin
import geoplot as gplt
from shapely.geometry import Point, Polygon
# Pollutant measure columns shared by every analysis below.
mean_cols = ['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean']

# Load the EPA pollution dataset; the CSV's first column is a row index.
data_df = pd.read_csv("pollution_us_2000_2016.csv", index_col=0, parse_dates=True)
data_df.sample(2)
data_df.shape

# Parse the measurement date, then derive a unix-seconds timestamp column
# for the regressions further down.
data_df['Date Local'] = pd.to_datetime(data_df['Date Local'])
data_df['timestamp'] = data_df['Date Local'].astype(np.int64) // 10**9
print("Dataset Covers from", data_df['Date Local'].min(), "to", data_df['Date Local'].max())
# Location identifiers are discrete labels, not quantities -- store them as
# categoricals to cut memory use and speed up later groupbys.
for location_col in ('State Code', 'County Code', 'Site Num',
                     'State', 'County', 'City'):
    data_df[location_col] = data_df[location_col].astype('category')
# "Site Num" is not a unique identifier. Group the data on the Address and Date
# to account for multiple measures on the same day in the same place, then
# reconstitute the location data.
# grab the State, County, and City names associated with an address
# (one row per unique Address; used to restore the location columns after the
# groupby below drops them)
address_details = data_df[['Address', 'State', 'County', 'City']].drop_duplicates()
address_details.set_index('Address', inplace=True)
print("Number of unique sites:",len(address_details))
# group the data by Address and Date to get the average measures for each day
# NOTE(review): .mean() here relies on non-numeric columns being silently
# dropped (old pandas behavior); newer pandas may need numeric_only=True --
# confirm against the pandas version in use.
data_grouped = pd.DataFrame(data_df.groupby(['Address', 'Date Local']).mean().to_records())
# merge the address details and the grouped data back together
# (inner join: grouped rows keyed on Address against the address_details index)
data_df = data_grouped.merge(
address_details,
left_on='Address',
right_index=True)
# Summary statistics for the grouped per-site daily data (notebook display).
data_df.describe()
# Distribution of each pollutant's daily mean.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot with a density scale and a KDE overlay reproduces its default output.
for measure_col in mean_cols:
    sns.histplot(data_df[measure_col], stat="density", kde=True)
# Pairwise relationships between the national daily means of the four pollutants.
p = sns.pairplot(data_df.groupby('Date Local')[mean_cols].mean())
p.fig.suptitle("Pairplot - US Mean Airborne Pollutant Measures", y=1)
plt.savefig("pairplot.png")
# Correlation matrix of the national daily-mean pollutant series.
fig, ax = plt.subplots(figsize=(7, 6))
daily_means = data_df.groupby('Date Local')[mean_cols].mean()
p = sns.heatmap(daily_means.corr(), annot=True, cmap="YlGnBu")
plt.title("Correlation Matrix - US Mean Airborne Pollutant Measures", y=1)
plt.savefig("correlation_matrix.png")
# National average of each measure per day, reused by all the joint plots.
average_measure_by_date = data_df.groupby('Date Local')[mean_cols].mean()

# Hex-bin joint distributions for the pollutant pairs of interest.
pollutant_pairs = [
    ('NO2 Mean', 'CO Mean', "joint_NO2_CO.png"),
    ('NO2 Mean', 'SO2 Mean', "joint_NO2_SO2.png"),
    ('CO Mean', 'SO2 Mean', "joint_CO_SO2.png"),
    ('CO Mean', 'O3 Mean', "joint_CO_O3.png"),
]
for x_col, y_col, out_file in pollutant_pairs:
    sns.jointplot(
        x=average_measure_by_date[x_col],
        y=average_measure_by_date[y_col],
        kind="hex")
    plt.savefig(out_file)
# Summary of the 365-day rolling national means (notebook display).
data_df.groupby('Date Local')[mean_cols].mean().rolling(365).mean().describe()
# get a rolling average for each pollutant
# levels are highly related to seasons, so we average over 365 days to get a better idea
# FIX: selecting groupby columns with a bare tuple was deprecated and now raises
# in modern pandas -- the column selection must be a list.
x = data_df.groupby('Date Local')[['NO2 Mean', 'SO2 Mean', 'O3 Mean', 'CO Mean']].mean().rolling(365).mean()
# get the first row with non-zero values - will be the same for all, so we can just pick one
starting_values = x[x['NO2 Mean'].notnull()].iloc[0]
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(11, 4)
# Plot each series as a fraction of its first defined rolling value.
# (x[x.notnull()] preserves the frame shape; NaN warm-up rows simply aren't drawn.)
sns.lineplot(
    data = x[x.notnull()]/starting_values,
    ax=ax
)
plt.title("Percent Change in Average Level of NO2, SO2, O3 and CO since 2001", fontsize=14)
plt.xlabel("Year")
plt.ylabel("Percent Change")
plt.savefig("per_change_overall.png")
# Annual national mean of each pollutant, indexed by calendar year;
# annual_diff_from_mean below reads this at module level.
means_by_year = data_df[['State', 'Date Local']+mean_cols].groupby(data_df['Date Local'].dt.year).mean()

def annual_diff_from_mean(row):
    """Return each measure's fractional difference from that year's national mean.

    Expects a flattened by-state/by-year row whose 'Date Local' holds the year.
    """
    yearly = means_by_year.loc[row['Date Local']]
    return row[mean_cols].sub(yearly).div(yearly)
def top_bottom_state_by_year(year):
    """Return (state with the lowest total_per_diff, state with the highest) for *year*."""
    year_rows = by_state_year[by_state_year['Date Local'] == year]
    ranked = year_rows.sort_values('total_per_diff')
    return ranked.iloc[0]['State'], ranked.iloc[-1]['State']
# Per-state, per-year average of each pollutant measure.
by_state_year = data_df[['State', 'Date Local']+mean_cols].groupby(["State", data_df['Date Local'].dt.year]).mean()
# flatten the dataframe to make it easier to work with
# from https://stackoverflow.com/a/34262133
by_state_year = pd.DataFrame(by_state_year.to_records())
# Replace absolute levels with fractional differences from that year's national
# mean, then sum the four differences into one ranking score per state-year.
by_state_year[mean_cols] = by_state_year.apply(annual_diff_from_mean, axis=1)
by_state_year['total_per_diff'] = by_state_year[mean_cols].sum(axis=1)

# Best ("Top") and worst ("Bottom") state for every year covered by the data.
results = []
for year in range(2000, 2017):
    top_state, bottom_state = top_bottom_state_by_year(year)
    results.append({"Year": year, "Top": top_state, "Bottom": bottom_state})
results = DataFrame(results)
results.set_index('Year', inplace=True)
results.to_csv("top_and_bottom_states.csv")
# Per-state least-squares fit of each measure against the day number;
# the slope estimates that state's long-run daily trend.
results = {}
for state in data_df.State.unique():
    state_rows = data_df[data_df.State == state]
    day_numbers = state_rows['timestamp'] // 86400
    slopes = {}
    for measure in mean_cols:
        slopes[measure] = stats.linregress(day_numbers, state_rows[measure]).slope
    results[state] = slopes
results = DataFrame(results).transpose()
# average them out and flip the sign - the colors look better that way
results['mean'] = results.mean(axis=1) * -1000
results.head()
# Continental-US choropleth of the per-state trend slopes computed above.
us_map = gpd.read_file("states_shape/states.shp")
# Drop the non-continental states so the map stays compact.
us_map = us_map[~us_map.STATE_NAME.isin(['Hawaii',"Alaska"])].copy()
# Outer join so states with no measurements are kept; their stats become 0.
us_map = us_map.merge(
results,
how="outer",
left_on="STATE_NAME",
right_index=True
).fillna(0)
# Notebook display: which states ended up with no data.
us_map[us_map['mean']==0]
# Base layer: state outlines; data layer: 'mean' slope classed with Fisher-Jenks.
ax = us_map.plot(color='white', edgecolor='black', figsize=(12,8))
us_map.fillna(0).plot(ax=ax, column='mean', cmap='Greens', scheme='fisher_jenks', legend=True)
# mark states with no data as grey
us_map[us_map['mean']==0].plot(ax=ax, color="grey")
leg = ax.get_legend()
leg.set_bbox_to_anchor((0., 0., 0.2, 0.2))
plt.title("Continental US States 17 year Change in Airborne Pollutants (PPM/1000 Days)")
ax.set_axis_off()
plt.savefig("chloro_pol_dec.png")
# Flag California rows (used only by the exploratory pairplot left commented out).
test_df = data_df.copy()
test_df['cali'] = (test_df.State == "California").astype('category')
test_df.head(1)
# sns.pairplot(test_df[['NO2 Mean', 'O3 Mean', 'SO2 Mean', 'CO Mean', 'cali']], hue='cali')

# 365-day rolling daily means: California vs. everywhere else.
cali_df = data_df.loc[data_df.State == "California"].groupby("Date Local").mean()[mean_cols].rolling(365).mean()
other_df = data_df.loc[data_df.State != "California"].groupby("Date Local").mean()[mean_cols].rolling(365).mean()
cali_df.reset_index(inplace=True)
other_df.reset_index(inplace=True)
# Unix-seconds column for regressing level against time.
cali_df['ts'] = cali_df['Date Local'].astype(np.int64) // 10**9
other_df['ts'] = other_df['Date Local'].astype(np.int64) // 10**9
# Discard the rolling-window warm-up rows (NaN until day 365).
cali_df = cali_df.dropna(subset=['CO Mean'])
other_df = other_df.dropna(subset=['CO Mean'])
def cali_vs_other(measure):
    """Plot *measure* over time for California vs. the non-California average.

    Draws two regression scatter plots on one axes (California first, then the
    rest of the US), saves the figure as cali_vs_<measure>.png, and returns the
    pyplot module so callers can keep customizing the figure.
    """
    sns.set()
    fig, ax = plt.subplots()
    fig.set_size_inches(11, 5)
    sns.regplot(
        x="ts",
        y=measure,
        data=cali_df,
        ax=ax
    )
    sns.regplot(
        x="ts",
        y=measure,
        data=other_df,
        ax=ax
    )
    # FIX: California is plotted first, so it must come first in the legend --
    # the labels were previously swapped, mislabeling both series.
    ax.legend(['California', 'US Average'])
    plt.title("California Vs US Average - {}".format(measure))
    plt.xlabel("timestamp")
    plt.savefig("cali_vs_{}.png".format(measure))
    return plt
# Produce the California-vs-US comparison figure for every pollutant.
for plotted_measure in mean_cols:
    cali_vs_other(plotted_measure)
# Full regression results for CO in each group (notebook display).
stats.linregress(cali_df['ts'], cali_df["CO Mean"])
stats.linregress(other_df['ts'], other_df["CO Mean"])

# Unix timestamps (midnight UTC) anchoring the policy windows analyzed below.
jan_1_2001_ts = 978307200
jan_1_2005_ts = 1104537600
jan_1_2012_ts = 1325376000
# Rolling-average window (days) for the CAIR/CSAPR analysis.
rolling_span = 365
# States covered by the Clean Air Interstate Rule / Cross-State Air Pollution Rule.
cair_states = ['Alabama','Arkansas','Georgia','Illinois','Indiana','Iowa','Kansas','Kentucky','Louisiana',
'Maryland','Michigan','Minnesota','Mississippi','Missouri','Nebraska','New Jersey','New York','North Carolina',
'Ohio','Oklahoma','Pennsylvania','South Carolina','Tennessee','Texas','Virginia','West Virginia','Wisconsin']
def _policy_window(start_year, end_year, baseline_ts):
    """Rolling NO2/SO2 means for CAIR states over [start_year, end_year).

    Returns a frame with a 'days_after' column (days since *baseline_ts*) and
    'NO2 Mean'/'SO2 Mean' columns normalized to the first non-NaN rolling value
    in the window, so each series starts at 1.0.

    FIX: this helper replaces three byte-identical copy-paste segments
    (pre-CAIR, post-CAIR, post-CSAPR) that differed only in the year bounds
    and baseline timestamp.
    """
    window = data_df[
        (data_df['Date Local'].dt.year >= start_year) &
        (data_df['Date Local'].dt.year < end_year) &
        (data_df.State.isin(cair_states))
    ].groupby('Date Local').mean()[['NO2 Mean', 'SO2 Mean']].rolling(rolling_span).mean()
    window.reset_index(inplace=True)
    window['days_after'] = ((window['Date Local'].astype(np.int64) // 10**9) - baseline_ts) // 86400
    # Drop the rolling warm-up rows, then express each series relative to its start.
    window = window[window['NO2 Mean'].notnull()]
    window['NO2 Mean'] = window['NO2 Mean'] / window.iloc[0]['NO2 Mean']
    window['SO2 Mean'] = window['SO2 Mean'] / window.iloc[0]['SO2 Mean']
    return window

pre_CAIR = _policy_window(2001, 2005, jan_1_2001_ts)
post_CAIR = _policy_window(2005, 2009, jan_1_2005_ts)
post_CSAPR = _policy_window(2012, 2016, jan_1_2012_ts)
# NO2: overlay the trend in each policy window on a single axes.
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
for window_df in (pre_CAIR, post_CAIR, post_CSAPR):
    sns.regplot(
        x='days_after',
        y='NO2 Mean',
        data=window_df
    )
plt.ylim(0, 1.2)
ax.legend(['2001-2005 (Pre-CAIR)','2005-2009 (CAIR)', '2012-2016 (CSAPR)'])
plt.title("Percent Change in NO2 Measures in CAIR/CSAPR States Pre/Post Implementation (365 day rolling average)")
plt.xlabel("Days")
plt.ylabel("Percent Change")
plt.savefig("cair_csapr_no2.png")
# SO2: same per-policy-window comparison as the NO2 figure.
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(12, 5)
for window_df in (pre_CAIR, post_CAIR, post_CSAPR):
    sns.regplot(
        x='days_after',
        y='SO2 Mean',
        data=window_df
    )
plt.ylim(0, 1.2)
# FIX: the legend years previously read 2000-2005 / 2010-2014, which did not
# match the actual data windows (2001-2005, 2005-2009, 2012-2016) used to
# build these frames, nor the NO2 figure's legend.
ax.legend(['2001-2005 (Pre-CAIR)','2005-2009 (CAIR)', '2012-2016 (CSAPR)'])
plt.title("Percent Change in SO2 Measures in CAIR/CSAPR States Pre/Post Implementation (365 day rolling average)")
plt.xlabel("Days")
plt.ylabel("Percent Change")
plt.savefig("cair_csapr_so2.png")
# Pre-CAIR SO2 trend slope (notebook display).
stats.linregress(pre_CAIR['days_after'], pre_CAIR["SO2 Mean"]).slope
# NOTE(review): this re-read replaces the cleaned/grouped data_df with the raw
# CSV -- intentional only because nothing below needs the processed frame.
data_df = pd.read_csv("pollution_us_2000_2016.csv", index_col=0, parse_dates=True)
# Write a 1000-row sample for quick sharing/inspection.
data_df.head(1000).to_csv("pollution_us_2000_2016_sample.csv")
# Measurement row count per state (counted via the Address column).
DataFrame(data_df.groupby("State").count()['Address'])