In [3]:
import warnings
warnings.filterwarnings('ignore')
In [4]:
import os, math, random, re
from datetime import datetime, timedelta, date

import pandas as pd
from pandas import DataFrame, Series
import numpy as np

from scipy.sparse import lil_matrix

import pysal
import pysal.esda as esda
import pysal.esda.mapclassify as mc

import geopandas as gpd
from geopandas.tools import sjoin
import geoplot as gplt
from shapely.geometry import Point, Polygon

import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns

from util import *
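# util provides the grid helpers used later in this notebook:
# lat_to_index, lon_to_index, index_to_lat, index_to_lon, bresenham_line, and running_mean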

Data Loading

In [5]:
%%time
detail_data=DataFrame()
loaded_files = []
for filename in os.listdir("data/details"):
    if filename.endswith(".csv"):
        loaded_files.append(pd.read_csv("data/details/"+filename))
detail_data = pd.concat(loaded_files)
detail_data.reset_index(drop=True,inplace=True)
Wall time: 23.3 s
In [6]:
detail_data.head()
Out[6]:
BEGIN_YEARMONTH BEGIN_DAY BEGIN_TIME END_YEARMONTH END_DAY END_TIME EPISODE_ID EVENT_ID STATE STATE_FIPS ... END_RANGE END_AZIMUTH END_LOCATION BEGIN_LAT BEGIN_LON END_LAT END_LON EPISODE_NARRATIVE EVENT_NARRATIVE DATA_SOURCE
0 195004 28 1445 195004 28 1445 NaN 10096222 OKLAHOMA 40.0 ... 0.0 NaN NaN 35.12 -99.20 35.17 -99.20 NaN NaN PUB
1 195004 29 1530 195004 29 1530 NaN 10120412 TEXAS 48.0 ... 0.0 NaN NaN 31.90 -98.60 31.73 -98.60 NaN NaN PUB
2 195007 5 1800 195007 5 1800 NaN 10104927 PENNSYLVANIA 42.0 ... 0.0 NaN NaN 40.58 -75.70 40.65 -75.47 NaN NaN PUB
3 195007 5 1830 195007 5 1830 NaN 10104928 PENNSYLVANIA 42.0 ... 0.0 NaN NaN 40.60 -76.75 NaN NaN NaN NaN PUB
4 195007 24 1440 195007 24 1440 NaN 10104929 PENNSYLVANIA 42.0 ... 0.0 NaN NaN 41.63 -79.68 NaN NaN NaN NaN PUB

5 rows × 51 columns

Convert the time-related strings to DateTime columns

In [7]:
%%time
# Pad the "DAY" columns to length 2 so the date parser can do it's thing
detail_data.BEGIN_DAY = detail_data.BEGIN_DAY.map(str).str.pad(2,fillchar='0')
detail_data.END_DAY = detail_data.END_DAY.map(str).str.pad(2,fillchar='0')
# ditto with the Time
detail_data.BEGIN_TIME = detail_data.BEGIN_TIME.map(str).str.pad(4,fillchar='0')
detail_data.END_TIME = detail_data.END_TIME.map(str).str.pad(4,fillchar='0')
# create a new column by concatenating the three date/time-related columns and convert the result to a datetime
detail_data['BEGIN_DATE']=detail_data.BEGIN_YEARMONTH.map(str)+" "+detail_data.BEGIN_DAY.map(str)+" "+detail_data.BEGIN_TIME.map(str)
detail_data.BEGIN_DATE=pd.to_datetime(detail_data.BEGIN_DATE, format='%Y%m %d %H%M', errors='coerce')

detail_data['END_DATE']=detail_data.END_YEARMONTH.map(str)+" "+detail_data.END_DAY.map(str)+" "+detail_data.END_TIME.map(str)
detail_data.END_DATE=pd.to_datetime(detail_data.END_DATE, format='%Y%m %d %H%M', errors='coerce')

# drop the old columns
detail_data = detail_data.drop(['BEGIN_YEARMONTH', "END_YEARMONTH",'BEGIN_DAY', "END_DAY", "BEGIN_TIME", "END_TIME"], axis=1)
detail_data = detail_data.drop(['YEAR', "MONTH_NAME",'END_DATE_TIME', "BEGIN_DATE_TIME"], axis=1)

# Columns I don't think I need at the moment
detail_data = detail_data.drop(['WFO', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE',
       'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION','EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'DATA_SOURCE'], axis=1)

# columns I'm less sure I don't need, but it'll make things easier for the time being
detail_data = detail_data.drop(['INJURIES_DIRECT',
       'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT',
       'DAMAGE_PROPERTY', 'DAMAGE_CROPS'], axis=1)
Wall time: 28.8 s

Correcting input errors

There are some input errors in the Lat/Lon coordinates: the decimal point is shifted one place, so dividing the affected values by ten gives the correct value (affects 2,080 rows)

In [8]:
detail_data.columns
Out[8]:
Index(['EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS', 'EVENT_TYPE',
       'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME', 'CZ_TIMEZONE', 'BEGIN_LAT',
       'BEGIN_LON', 'END_LAT', 'END_LON', 'BEGIN_DATE', 'END_DATE'],
      dtype='object')
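
A quick sanity check of how many rows are affected before applying the fix (a small sketch; any longitude west of -180° can only be a shifted decimal):

print("BEGIN_LON shifted:", (detail_data.BEGIN_LON < -180).sum())
print("END_LON shifted:", (detail_data.END_LON < -180).sum())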
In [9]:
detail_data.loc[detail_data.BEGIN_LON < -180, 'BEGIN_LON']=detail_data.loc[detail_data.BEGIN_LON < -180, 'BEGIN_LON']/10
detail_data.loc[detail_data.END_LON < -180, 'END_LON']=detail_data.loc[detail_data.END_LON < -180, 'END_LON']/10

Fix inconsistencies in event type tagging

In [10]:
detail_data.loc[detail_data.EVENT_TYPE.isin(['HAIL FLOODING', 'HAIL/ICY ROADS']), 'EVENT_TYPE'] = 'Hail'
detail_data.loc[detail_data.EVENT_TYPE.isin(['Heavy Wind']), 'EVENT_TYPE'] = 'High Wind'
detail_data.loc[detail_data.EVENT_TYPE.isin(['High Snow']), 'EVENT_TYPE'] = 'Heavy Snow'
detail_data.loc[detail_data.EVENT_TYPE.isin(
    [
        'TORNADOES, TSTM WIND, HAIL',
        'THUNDERSTORM WINDS/FLOODING',
        'THUNDERSTORM WINDS/FLASH FLOOD',
        'THUNDERSTORM WINDS LIGHTNING',
        'THUNDERSTORM WIND/ TREES',
        'THUNDERSTORM WIND/ TREE',
        'THUNDERSTORM WINDS FUNNEL CLOU',
        'THUNDERSTORM WINDS/HEAVY RAIN',
        'THUNDERSTORM WINDS HEAVY RAIN',
        'THUNDERSTORM WINDS/ FLOOD'
    ]
), 'EVENT_TYPE'] = 'Thunderstorm Wind'
detail_data.loc[detail_data.EVENT_TYPE.isin(['TORNADO/WATERSPOUT']), 'EVENT_TYPE'] = 'Tornado'
detail_data.loc[detail_data.EVENT_TYPE.isin(['Landslide']), 'EVENT_TYPE'] = 'Debris Flow'
detail_data.loc[detail_data.EVENT_TYPE.isin(['Volcanic Ashfall']), 'EVENT_TYPE'] = 'Volcanic Ash'

Drop 14 rows tagged "Northern Lights" and a single row tagged 'OTHER'.

In [11]:
detail_data.drop(detail_data[detail_data.EVENT_TYPE.isin(['Northern Lights', 'OTHER'])].index, inplace=True)

Trim the data to just the Continental US

In [12]:
detail_data = detail_data[
        ((detail_data.BEGIN_LON > -124.7844079) &
        (detail_data.BEGIN_LON < -66.9513812) &
        (detail_data.BEGIN_LAT > 24.7433195) &
        (detail_data.BEGIN_LAT < 49.3457868)) |
        (detail_data.BEGIN_LAT.isnull())
    ].copy()

Create Categorical Index

Create two categorical columns: one based on the event type and another based on my not-very-scientific meta classification

In [13]:
detail_data.EVENT_TYPE = detail_data.EVENT_TYPE.astype("category")
detail_data['EVENT_CODE'] = detail_data.EVENT_TYPE.cat.codes
In [14]:
detail_data['META_TYPE'] = None
detail_data.loc[detail_data.EVENT_TYPE.isin(
    [
        'Avalanche',
        'Blizzard',
        'Cold/Wind Chill',
        'Extreme Cold/Wind Chill',
        'Freezing Fog',
        'Frost/Freeze',
        'Hail',
        'Heavy Snow',
        'Ice Storm',
        'Lake-Effect Snow',
        'Marine Hail',
        'Sleet',
        'Winter Storm',
        'Winter Weather'
    ]
), 'META_TYPE'] = 'Cold'

detail_data.loc[detail_data.EVENT_TYPE.isin(
    [
        'Dense Smoke',
        'Drought',
        'Excessive Heat',
        'Heat',
        'Volcanic Ash',
        'Wildfire'
    ]
), 'META_TYPE'] = 'Heat'

detail_data.loc[detail_data.EVENT_TYPE.isin(
    [
        'Debris Flow',
        'Dense Fog',
        'Lightning',
        'Marine Dense Fog',
        'Marine Lightning'
    ]
), 'META_TYPE'] = 'Other'

detail_data.loc[detail_data.EVENT_TYPE.isin(
    [
        'Astronomical Low Tide',
        'Coastal Flood',
        'Flash Flood',
        'Flood',
        'Heavy Rain',
        'High Surf',
        'Lakeshore Flood',
        'Rip Current',
        'Seiche',
        'Sneakerwave',
        'Storm Surge/Tide',
        'Tsunami',
        'Waterspout'
    ]
), 'META_TYPE'] = 'Water'

detail_data.loc[detail_data.EVENT_TYPE.isin(
    [
        'Dust Devil',
        'Dust Storm',
        'Funnel Cloud',
        'High Wind',
        'Hurricane',
        'Hurricane (Typhoon)',
        'Marine High Wind',
        'Marine Hurricane/Typhoon',
        'Marine Strong Wind',
        'Marine Thunderstorm Wind',
        'Marine Tropical Depression',
        'Marine Tropical Storm',
        'Strong Wind',
        'Thunderstorm Wind',
        'Tornado',
        'Tropical Depression',
        'Tropical Storm'
    ]
), 'META_TYPE'] = 'Wind'
detail_data.META_TYPE = detail_data.META_TYPE.astype("category")
detail_data['META_CODE'] = detail_data.META_TYPE.cat.codes

Fill in missing data

The US Census Gazetteer provides a file with a latitude/longitude centroid for every county in the US. Combined with the FIPS codes, it lets us assign estimated locations to events with no clear starting point (about a third of the dataset - things like "Drought" or "Strong Wind" that don't have a well-defined geographic location). Dataset from https://www.census.gov/geo/maps-data/data/gazetteer2017.html

In [15]:
county_df = pd.read_csv('data/2017_Gaz_counties_national.txt', sep='\t', engine='python')
county_df['FIPS'] = county_df['GEOID'].apply(lambda x: '{0:0>5}'.format(x))
county_df.set_index("FIPS", inplace=True)
county_df.head(3)
Out[15]:
USPS GEOID ANSICODE NAME ALAND AWATER ALAND_SQMI AWATER_SQMI INTPTLAT INTPTLONG
FIPS
01001 AL 1001 161526 Autauga County 1539614693 25744269 594.449 9.940 32.532237 -86.646440
01003 AL 1003 161527 Baldwin County 4117605847 1133109409 1589.817 437.496 30.659218 -87.746067
01005 AL 1005 161528 Barbour County 2292144656 50538698 885.002 19.513 31.870253 -85.405104

Fill in Lat/Lon for events based on county centroid

Create a new column that combines the State and County FIPS so we can look them up (gets coordinates for ~250k entries)

In [16]:
detail_data['FIPS'] = detail_data['STATE_FIPS']*1000+detail_data['CZ_FIPS']
detail_data['FIPS'].fillna(0, inplace=True)
detail_data['FIPS'] = detail_data['FIPS'].apply(lambda x: '{0:0>5}'.format(int(x)))

# drop the old columns, since we've condensed that data into one column
detail_data = detail_data.drop(['EPISODE_ID', 'EVENT_ID', 'STATE_FIPS', 'CZ_TYPE', 'CZ_FIPS', 'CZ_TIMEZONE'], axis=1)
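
For illustration, a small worked example of the FIPS formula above (state FIPS 40 is Oklahoma and county/zone FIPS 109 is Oklahoma County; the values are for demonstration only):

state_fips, cz_fips = 40, 109           # example values
fips = state_fips * 1000 + cz_fips      # 40109
print('{0:0>5}'.format(int(fips)))      # '40109' - matches the Gazetteer FIPS index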

Try to match FIPS IDs for events with no location info, and fill in the begin lat/lon with that county's centroid

In [17]:
# temporarily assign FIPS as the index
detail_data.set_index("FIPS", inplace=True)

# left join on the county dataframe to get a column of county centroids
detail_data = detail_data.join(county_df[['INTPTLAT', 'INTPTLONG']], on="FIPS")

# reset the index to avoid duplicate issues in a sec 
detail_data.reset_index(inplace=True)

# if the lat/lon are blank, and we were able to match it to a FIPS code, add the county centroid as the event lat/lon
detail_data.loc[detail_data.BEGIN_LAT.isnull(), 'BEGIN_LAT'] = detail_data.INTPTLAT
detail_data.loc[detail_data.BEGIN_LON.isnull(), 'BEGIN_LON'] = detail_data.INTPTLONG

#drop the extra columns now that we don't need them
detail_data.drop(['INTPTLAT', 'INTPTLONG'], axis=1, inplace=True)

Fill in missing Date Entries

Some entries have an end date but no begin date. We just fill in the missing begin date with the end date (~110k entries)

In [18]:
detail_data.loc[detail_data.BEGIN_DATE.isnull(), 'BEGIN_DATE'] = detail_data.END_DATE
In [19]:
# redundant after all the date messing earlier, but these turn out to be real handy
detail_data['MONTH'] = detail_data.BEGIN_DATE.dt.month
detail_data['DAY'] = detail_data.BEGIN_DATE.dt.day

TODO: look at filling in the remaining missing points with bp02oc18.dbx from https://www.weather.gov/gis/ZoneCounty

Drop rows without enough data to do anything with

In [20]:
detail_data = detail_data.drop(detail_data[
    (detail_data.BEGIN_LON.isnull()) |
    (detail_data.BEGIN_LAT.isnull()) |
    (detail_data.BEGIN_DATE.isnull())
].index)

Create the matrices and plots

In [21]:
# get a matrix based on a given date and a +/- day span on either side
def get_day_matrix(date,span=30,meta_type=None,event_type=None,coords=None,coord_range=None):
    data_set = []
    for year in range(1950,2017):
        # set the year in the date
        date = date.replace(year=year)

        # set the start and end of the window based on the span
        start = date - timedelta(days=span)
        end = date + timedelta(days=span)

        year_data = detail_data[
            (detail_data.BEGIN_DATE>pd.to_datetime(start)) &
            (detail_data.BEGIN_DATE<pd.to_datetime(end))
        ]
        if meta_type is not None:
            year_data = year_data[year_data.META_TYPE == meta_type]
        if event_type is not None:
            year_data = year_data[year_data.EVENT_TYPE == event_type]
            
        if coords is not None:
            year_data = year_data[
                (year_data.BEGIN_LAT > coords[0]-coord_range) &
                (year_data.BEGIN_LAT < coords[0]+coord_range) &
                (year_data.BEGIN_LON > coords[1]-coord_range) &
                (year_data.BEGIN_LON < coords[1]+coord_range)
            ]
                
        data_set.append(year_data)

    data = pd.concat(data_set)

    m = lil_matrix((5783,2760), dtype=np.float16)

    for i, row in data.iterrows():
        # calculate the discount applied to the matrix value based on how many days off we are
        date = date.replace(year=row.BEGIN_DATE.year)
        days_off = abs((row.BEGIN_DATE-date).days)
        if days_off == 0:
            mat_val = 1
        else:
            mat_val = 1/(1+math.log(days_off))

        # is this a single point event or a line
        if pd.isnull(row.END_LAT) or pd.isnull(row.END_LON):
            # single point - plot a square
            col_id = lat_to_index(row.BEGIN_LAT)
            row_id = lon_to_index(row.BEGIN_LON)
            for r in range(row_id-3, row_id+4):
                for c in range(col_id-3, col_id+4):
                    try:
                        m[r,c] += mat_val
                    except:
                        print("bad row:",row)
        else:
            # we have a start and an end, so it's a line
            start = (lat_to_index(row.BEGIN_LAT),lon_to_index(row.BEGIN_LON))
            end = (lat_to_index(row.END_LAT),lon_to_index(row.END_LON))
            for pos in bresenham_line(start,end):
                try:
                    m[pos[1],pos[0]] += mat_val
                except:
                    print("bad row (line):",pos)
    return m

# Examples:
#
# get_day_matrix(
#     datetime.strptime('11/15/18', '%m/%d/%y'),
#     event_type="Dense Fog"
# )

# get_day_matrix(
#     date, 
#     span=30, 
#     meta_type="Cold", 
#     event_type=None,
#     coords=coords,
#     coord_range=2.5
# )
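
For intuition, the per-event weight above decays logarithmically with how many days the event is from the target date; a quick look at a few values (a small sketch of the same formula):

for days_off in [0, 1, 2, 7, 14, 30]:
    mat_val = 1 if days_off == 0 else 1/(1+math.log(days_off))
    print(days_off, round(mat_val, 3))
# 0 and 1 day off both weigh 1.0; 7 days off ~0.34; 30 days off ~0.23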
In [22]:
# creates a GeoPandas GeoDataFrame from the matrix output of get_day_matrix
def create_geopandas_from_matrix(m):
    # get the row and col arrays from the non zero elements of the matrix
    row,col = m.nonzero()
    # iterate through the arrays and convert them to their lat/lon approximations
    data=[]
    for x in range(len(row)):
        data.append({
            "lat":index_to_lat(col[x]),
            "lon":index_to_lon(row[x]),
            "x":m[row[x],col[x]]
        })
    # create a pandas dataframe from the data, then convert it to a geopandas dataframe
    data = DataFrame(data)
    if len(data) > 0:
        data['Coordinates'] = list(zip(data.lon, data.lat))
        data['Coordinates'] = data['Coordinates'].apply(Point)
        return gpd.GeoDataFrame(data, geometry='Coordinates')
    else:
        return None

Plotting

In [23]:
def get_state_shapemap():
    us_map = gpd.read_file("data/states_shape/states.shp")
    return us_map[~us_map.STATE_NAME.isin(['Hawaii',"Alaska"])].copy()
In [24]:
def get_county_shapemap():
    us_county_map = gpd.read_file("data/shape/us_counties/cb_2017_us_county_500k.shp")
    us_county_map.STATEFP = us_county_map.STATEFP.astype(int)
    # constrain the county map to the continental us
    return us_county_map[
        (us_county_map.STATEFP<60) &
        (us_county_map.STATEFP!=2) &
        (us_county_map.STATEFP!=15)
    ]
In [ ]:
# create the distance weights
wq = pysal.weights.DistanceBand.from_dataframe(get_county_shapemap(), threshold=3)
# row-standardize the weights (each county's weights sum to 1)
wq.transform = 'r'
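
A quick sanity check of the weights object (a sketch; n, neighbors, and weights are standard attributes of a PySAL W object):

first = list(wq.neighbors.keys())[0]
print("counties:", wq.n)
print("neighbors of first county:", len(wq.neighbors[first]))
print("row sum (should be 1 for counties with neighbors):", sum(wq.weights[first]))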
In [ ]:
%%time
#DEMO PLOT - County plot by straight count
date = datetime.strptime('04/15/18', '%m/%d/%y')
m = get_day_matrix(date, span=10, event_type="Tornado")
data = create_geopandas_from_matrix(m)

us_county_map = get_county_shapemap()

data.crs = us_county_map.crs
county_data = sjoin(data, us_county_map, how='left')
us_county_map.set_index("GEOID", inplace=True)
us_county_map['counts'] = county_data.groupby("GEOID")['x'].sum()
us_county_map.counts.fillna(0, inplace=True)

ax = us_county_map.plot(column="counts", figsize=(5,5))

# We can now plot our GeoDataFrame.
plt.title("Points Condensed into Counties")
plt.show()
In [ ]:
# make a Getis-Ord G* plot given a dataframe from 'create_geopandas_from_matrix(m)'
def make_gettis_ord_g_plot(data, title="Gettis Ord G* Autocorrelation", filename=None, color_map="GnBu", ret_values=False):
    # get the county shapemap
    us_county_map = get_county_shapemap()

    # join all of the data into the county dataframe
    data.crs = us_county_map.crs
    county_data = sjoin(data, us_county_map, how='left')
    us_county_map.set_index("GEOID", inplace=True)
    us_county_map['counts'] = county_data.groupby("GEOID")['x'].sum()
    us_county_map.counts.fillna(0, inplace=True)
    
    # calculate the Getis-Ord Gi* z-score of each county based on the data
    us_county_map['counts_g_local'] = esda.getisord.G_Local(us_county_map['counts'], wq, permutations=5).Zs

    # plot the result
    f, ax = plt.subplots(1, figsize=(20,20))
    us_county_map.plot(column='counts_g_local', cmap=color_map, linewidth=0.1, ax=ax, edgecolor='black')
    ax.set_axis_off()
    
    # If there's a filename, just save it, if not, plot it to the notebook
    plt.title(title)
    if filename is not None:
        plt.savefig(filename)
        plt.close()
    else:
        plt.show()
        
    # in case I want to get the results back
    if ret_values:
        return us_county_map
In [ ]:
%%time
make_gettis_ord_g_plot(data, title="Gettis Ord G* Autocorrelation - Tornado - 04/15", color_map="Reds")
In [ ]:
# get just the Getis-Ord Gi* z-scores for a given dataframe from 'create_geopandas_from_matrix(m)'
def get_gettis_ord_g_scores(data):
    # get the county shapemap
    us_county_map = get_county_shapemap()

    # join all of the data into the county dataframe
    data.crs = us_county_map.crs
    county_data = sjoin(data, us_county_map, how='left')
    us_county_map.set_index("GEOID", inplace=True)
    us_county_map['counts'] = county_data.groupby("GEOID")['x'].sum()
    us_county_map.counts.fillna(0, inplace=True)
    
    return esda.getisord.G_Local(us_county_map['counts'], wq, permutations=5).Zs
In [ ]:
%%timeit
get_gettis_ord_g_scores(data)

Create Getis-Ord Gi* DataFrames

In [ ]:
def get_spatial_lag_quartiles(data, title="Spatial Lag Median Risk Quartiles", filename=None, color_map="GnBu", ret_values=False):
    us_county_map = get_county_shapemap()

    data.crs = us_county_map.crs
    county_data = sjoin(data, us_county_map, how='left')
    us_county_map.set_index("GEOID", inplace=True)
    us_county_map['counts'] = county_data.groupby("GEOID")['x'].sum()
    us_county_map.counts.fillna(0, inplace=True)

    # build spatial weights linking each county to its nearby counties based on the shapefile geometry
    # wq = pysal.weights.Queen.from_dataframe(us_county_map)
    wq = pysal.weights.DistanceBand.from_dataframe(us_county_map, threshold=2, binary=True)
    # row-standardize the weights (each county's weights sum to 1)
    wq.transform = 'r'

    # get the spatial lag for each county - based on the attribute values we see in neighboring counties
    y = us_county_map['counts']
    ylag = pysal.weights.lag_spatial(wq,y)

    f, ax = plt.subplots(1, figsize=(20,20))
    us_county_map.assign(cl=ylag).plot(column='cl', categorical=True,
            k=5, cmap=color_map, linewidth=0.1, ax=ax,
            edgecolor='black')
    ax.set_axis_off()
    
    plt.title(title)
    if filename is not None:
        plt.savefig(filename)
        plt.close()
    else:
        plt.show()
    
    if ret_values:
        return us_county_map.assign(cl=ylag)
    
In [ ]:
# restricting predictions to these types at the moment
event_types = ['Blizzard', 'Extreme Cold/Wind Chill', 'Hail', 'Heavy Snow', 'Ice Storm', 'Winter Storm','Drought', 
    'Excessive Heat', 'Wildfire', 'Flash Flood', 'Heavy Rain', 'High Wind', 'Hurricane', 'Thunderstorm Wind', 'Tornado']
In [ ]:
# running overnight - tack on the others to run after the main ones
# event_types = event_types + [e for e in detail_data.EVENT_TYPE.unique() if e not in event_types]
In [ ]:
%%time

for event_type in event_types:
    date = datetime.strptime('01/01/18', '%m/%d/%y')
    # set up the county data 
    county_data = get_county_shapemap()
    county_data.set_index("GEOID", inplace=True)
    
    while date.year<2019:
        print("Plotting {} - {}".format(event_type, str(date).split()[0]))

        # make a matrix for this day, using events +/- 2 weeks
        m = get_day_matrix(date, span=14, event_type=event_type)
        # throw that data into a geopandas frame
        data = create_geopandas_from_matrix(m)
        if data is None:
            print("No data found for", date)
            county_data["{}-{:%m-%d}_g_score".format(re.sub(r'\W+', '', event_type), date)] = 0
        else:
            vals = get_gettis_ord_g_scores(data)
            county_data["{}-{:%m-%d}_g_score".format(re.sub(r'\W+', '', event_type), date)] = vals

        date = date + timedelta(days=7)        
    # save it
    county_data.to_file("output/{}_weekly_g_score.geoJSON".format(re.sub(r'\W+', '', event_type)))

Make Predictions

In [25]:
# restricting predictions to these types at the moment
event_types = ['Blizzard', 'Extreme Cold/Wind Chill', 'Hail', 'Heavy Snow', 'Ice Storm', 'Winter Storm','Drought', 
    'Excessive Heat', 'Wildfire', 'Flash Flood', 'Heavy Rain', 'High Wind', 'Hurricane', 'Thunderstorm Wind', 'Tornado']

Getting labeled events

In [26]:
# super simple - pick an event at random that occurred after 2016-01-01
def get_positive_event():  
    event = detail_data[detail_data.BEGIN_DATE > '2016-01-01'].sample()
    return event.EVENT_TYPE.values[0], event.META_TYPE.values[0], (event.BEGIN_LAT.values[0], event.BEGIN_LON.values[0]), event.BEGIN_DATE.values[0]
In [27]:
# trickier - we need to pick a location and date at random, then check to make sure nothing happened 
# if it did, we guess again until we come up with nothing
def get_negative_event(max_range=1.5, day_range=10):
    event_count = 99
    event_type = random.choice(event_types)
    while event_count != 0:
        event_count = 0
        # guess a random lat/long combo in the continental us
        lat = random.uniform(24.7433195, 49.3457868)
        lon = random.uniform(-124.7844079, -66.9513812)
        
        # guess a random date (days from 1-28 so I don't have to worry about Feb)
        search_date = datetime(year=2016, month=random.randint(1,12), day=random.randint(1,28))
        
        # get all events from the dataset within +/- max_range of that location
        nearby_events = detail_data[
            ((detail_data.BEGIN_LON > lon-max_range) &
            (detail_data.BEGIN_LON < lon+max_range) &
            (detail_data.BEGIN_LAT > lat-max_range) &
            (detail_data.BEGIN_LAT < lat+max_range)) &
            (detail_data.EVENT_TYPE==event_type)
        ].copy()
        
        # check every year in the dataset and count how many hits we get
        # TODO: There's got to be a simpler way of doing this
        for year in range(1950, 2017):
            search_date = search_date.replace(year=year)
            event_count += len(nearby_events[
                ((nearby_events.BEGIN_DATE >= (search_date - timedelta(days=day_range))) &
                (nearby_events.BEGIN_DATE <= (search_date + timedelta(days=day_range))))
            ])
#         print("{} {} events at ({},{}) within {} days of {}".format(event_count, event_type, lat, lon, day_range, search_date))
    return event_type, "", (lat, lon), search_date

Prediction Approach #1

In [33]:
rolling = 7
threshold = 0.01

def predict(event_type, coords, date):
    # load in the geopandas file with the g scores
    data_g_scores = gpd.read_file("output/{}_weekly_g_score.geoJSON".format(re.sub(r'\W+', '', event_type)))
    
    # get the data for the specific county these coordinates fall in 
    county_level_data = data_g_scores[data_g_scores.geometry.contains(Point(coords[1],coords[0]))]
    
    # not sure why this happens - presumably the point doesn't fall inside any county polygon
    if len(county_level_data) == 0:
        return False, 0
    
    # screwy, but this works. Get an array of values by week
    county_data = county_level_data.iloc[:,8:-1].transpose().iloc[:,0].values
    
    week_of_year = date.isocalendar()[1]
    value = pd.Series(county_data).rolling(rolling).mean().values[week_of_year]
    
    return value > threshold, value
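
A single hand-picked call looks like the following (illustrative coordinates near Kansas City; it assumes the Tornado weekly geoJSON file was generated by the loop above):

likely, score = predict("Tornado", (39.1, -94.6), datetime.strptime('05/20/18', '%m/%d/%y'))
print("likely:", likely, "score:", score)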
In [34]:
%%time
tries = 10
results = []

# check positive events
for x in range(tries):
    event_type = ""
    while event_type not in event_types:
        event_type, meta_type, coords, date = get_positive_event()
    # from https://stackoverflow.com/a/29753985
    date = datetime.utcfromtimestamp(date.tolist()/1e9)
    try:
        prediction = predict(event_type, coords, date)
        results.append({
            "truth":True,
            "prediction":prediction[0],
            "p_value":prediction[1],
            "event_type":event_type,
            "coords":coords,
            "date":date
        })
    except:
        print("ERR (POS):",event_type, coords, date)

# check negative events
for x in range(tries):
    event_type = ""
    while event_type not in event_types:
        event_type, meta_type, coords, date = get_negative_event()
    try:
        prediction = predict(event_type, coords, date)
        results.append({
            "truth":False,
            "prediction":prediction[0],
            "p_value":prediction[1],
            "event_type":event_type,
            "coords":coords,
            "date":date
        })
    except:
        print("ERR (NEG):",event_type, coords, date)
Wall time: 17.3 s
In [35]:
DataFrame(results)
Out[35]:
coords date event_type p_value prediction truth
0 (44.288999999999994, -105.3636) 2016-08-07 18:55:00 Tornado 1.164462 True True
1 (39.22, -94.94) 2017-06-16 23:18:00 Thunderstorm Wind 0.485024 True True
2 (36.29, -91.57) 2017-03-01 03:05:00 Thunderstorm Wind 0.064905 True True
3 (48.01, -98.0) 2016-06-10 02:00:00 Thunderstorm Wind -0.451598 False True
4 (40.26, -79.18) 2016-06-03 08:26:00 Flash Flood 0.239996 True True
5 (37.939479999999996, -76.94187099999999) 2016-04-02 23:00:00 High Wind 0.036513 True True
6 (38.14, -93.75) 2017-08-20 17:25:00 Thunderstorm Wind -0.003817 False True
7 (39.0476, -83.0712) 2017-05-27 18:15:00 Flash Flood 0.503592 True True
8 (40.61, -102.47) 2016-06-27 16:30:00 Tornado 1.944154 True True
9 (38.9, -94.37) 2016-05-26 14:32:00 Hail 0.423079 True True
10 (49.16748863798082, -87.3481398519664) 2016-09-10 00:00:00 Winter Storm 0.000000 False False
11 (32.59063113488605, -68.3310720957892) 2016-10-13 00:00:00 Ice Storm 0.000000 False False
12 (24.903452881324505, -101.40765439450563) 2016-11-17 00:00:00 Tornado 0.000000 False False
13 (25.661652087001055, -90.19885645193351) 2016-09-11 00:00:00 Heavy Rain 0.000000 False False
14 (36.25214051698068, -87.82126043135372) 2016-11-08 00:00:00 Winter Storm -0.206816 False False
15 (48.36028341662422, -87.30865525580604) 2016-01-06 00:00:00 Excessive Heat 0.000000 False False
16 (30.538960204993828, -84.3035844545294) 2016-11-08 00:00:00 Ice Storm -0.092268 False False
17 (37.73191305104431, -81.41733360871362) 2016-04-20 00:00:00 Ice Storm -0.011516 False False
18 (47.31973834891245, -70.13225918619943) 2016-05-21 00:00:00 High Wind 0.000000 False False
19 (36.40887967930723, -69.77530881319103) 2016-03-09 00:00:00 Heavy Snow 0.000000 False False
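
A quick accuracy summary over the sampled events (a sketch using the results list built above):

results_df = DataFrame(results)
print("accuracy: {:.0%}".format((results_df.prediction == results_df.truth).mean()))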

Prediction Approach #2

In [ ]:
day_range=21
max_range=1.5
threshold = 0.75

results = []

conf={"tt":0, "tf":0, "ft":0, "ff":0}

for x in range(3):
    lat = random.uniform(24.7433195, 49.3457868)
    lon = random.uniform(-124.7844079, -66.9513812)
    correct = 0
    print(x)

    
    for event_type in event_types:
        try:
            likely = True

            data_g_scores = gpd.read_file("output/{}_weekly_g_score.geoJSON".format(re.sub(r'\W+', '', event_type)))

            # get the data for the county from this data set
            county_level_data = data_g_scores[data_g_scores.geometry.contains(Point(lon,lat))]

            # screwy, but gets us a week number for our predictions
            county_level_data = county_level_data.iloc[:,9:-1].transpose()
            county_level_data.columns = ['value']
            county_level_data.reset_index(inplace=True)
            predicted_weeks = county_level_data.sort_values("value", ascending=False).head(2).index.values

            max_like = county_level_data.sort_values("value", ascending=False).head(1).value.values[0]
            if max_like < threshold:
                likely=False

            # derive a date from the Sunday of that week that we can use to search
            search_date = datetime.strptime("2017-W{}".format(predicted_weeks[0]) + '-0', "%Y-W%W-%w")

            # filter the detail data down by distance and date to see if any of this kind of event are within our window
            prediction_set = detail_data[
                        ((detail_data.BEGIN_LON > lon-max_range) &
                        (detail_data.BEGIN_LON < lon+max_range) &
                        (detail_data.BEGIN_LAT > lat-max_range) &
                        (detail_data.BEGIN_LAT < lat+max_range)) &
                        (detail_data.EVENT_TYPE==event_type)
                    ].copy()

            prediction_set = prediction_set[
                (prediction_set.BEGIN_DATE >= (search_date - timedelta(days=day_range))) &
                (prediction_set.BEGIN_DATE <= (search_date + timedelta(days=day_range)))
            ]

            if (len(prediction_set) > 0 and likely) or (len(prediction_set) == 0 and not likely):
                if len(prediction_set) > 0 and likely: conf['tt'] += 1
                else: conf['ff'] += 1
                    
                correct += 1
                print("Succcess - {} near {} on {} (Guess:{})".format(event_type, (lat,lon), search_date, likely))
            else:
                if likely: conf['tf'] += 1
                else: conf['ft'] += 1
                    
                print("Fail - {} near {} on {} (Guess:{})".format(event_type, (lat,lon), search_date, likely))
                pass
        except:
            continue

    print("Total Correct:{}/{}".format(correct,len(event_types)))
    results.append({
        "lat":lat,
        "lon":lon,
        "correct":correct
    })
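
The conf dictionary maps onto a standard confusion matrix (tt = true positive, ff = true negative, tf = false positive, ft = false negative), so summary metrics can be pulled out afterwards (a sketch):

total = sum(conf.values())
accuracy = (conf['tt'] + conf['ff']) / total if total else 0
precision = conf['tt'] / (conf['tt'] + conf['tf']) if (conf['tt'] + conf['tf']) else 0
recall = conf['tt'] / (conf['tt'] + conf['ft']) if (conf['tt'] + conf['ft']) else 0
print("accuracy: {:.2f}  precision: {:.2f}  recall: {:.2f}".format(accuracy, precision, recall))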

Old Code I don't want to throw away

But I probably should

In [ ]:
# Get a matrix based on the start and end dates provided
# can also specify a meta type or an event type
def get_matrix(start, end, meta_type=None, event_type=None):
    if meta_type is None and event_type is None:
        # filter the detail data by those two dates and add it to the set
        data = detail_data[
            (detail_data.BEGIN_DATE>pd.to_datetime(start)) &
            (detail_data.BEGIN_DATE<pd.to_datetime(end))
        ]
    else:
        if meta_type is not None:
            data = detail_data[
                (detail_data.BEGIN_DATE>pd.to_datetime(start)) &
                (detail_data.BEGIN_DATE<pd.to_datetime(end)) &
                (detail_data.META_TYPE == meta_type)
            ]
        elif event_type is not None:
            data = detail_data[
                (detail_data.BEGIN_DATE>pd.to_datetime(start)) &
                (detail_data.BEGIN_DATE<pd.to_datetime(end)) &
                (detail_data.EVENT_TYPE == event_type)
            ]

    m = lil_matrix((5783,2760), dtype=np.float16)

    for i, row in data.iterrows():
        mat_val = 1

        # is this a single point event or a line
        if pd.isnull(row.END_LAT) or pd.isnull(row.END_LON):
            # single point - plot a square
            col_id = lat_to_index(row.BEGIN_LAT)
            row_id = lon_to_index(row.BEGIN_LON)
            for r in range(row_id-3, row_id+4):
                for c in range(col_id-3, col_id+4):
                    try:
                        m[r,c] += mat_val
                    except:
                        print("bad row:",row)
        else:
            # we have a start and an end, so it's a line
            start = (lat_to_index(row.BEGIN_LAT),lon_to_index(row.BEGIN_LON))
            end = (lat_to_index(row.END_LAT),lon_to_index(row.END_LON))
            for pos in bresenham_line(start,end):
                try:
                    m[pos[1],pos[0]] += mat_val
                except:
                    print("bad row (line):",pos)
    return m
In [ ]:
def get_year_month_matrix(year, month):
    start = "{:0d}-{:02d}-01".format(year, month)
    if month == 12:
        month = 1
        year += 1
    else:
        month+=1
    end = "{:0d}-{:02d}-01".format(year, month)
    year_data = detail_data[(detail_data.BEGIN_DATE>start) & (detail_data.BEGIN_DATE<end)].dropna()
    # trim it to just the continental US
    year_data = year_data[
        (year_data.BEGIN_LON > -124.7844079) &
        (year_data.BEGIN_LON < -66.9513812) &
        (year_data.BEGIN_LAT > 24.7433195) &
        (year_data.BEGIN_LAT < 49.3457868)
    ]
    m = lil_matrix((5783,2760), dtype=np.int16)
    for i, row in year_data.iterrows():
        # single point event (no end coordinates) - plot a square
        if pd.isnull(row.END_LAT) or pd.isnull(row.END_LON):
            col_id = lat_to_index(row.BEGIN_LAT)
            row_id = lon_to_index(row.BEGIN_LON)
            for r in range(row_id-3, row_id+4):
                for c in range(col_id-3, col_id+4):
                    try:
                        m[r,c] = 1
                    except:
                        print("bad row:",row)
        else:
            # we have a start and an end, so it's a line
            start = (lat_to_index(row.BEGIN_LAT),lon_to_index(row.BEGIN_LON))
            end = (lat_to_index(row.END_LAT),lon_to_index(row.END_LON))
            for pos in bresenham_line(start,end):
                try:
                    m[pos[1],pos[0]] = 1
                except:
                    print("bad row:",pos)
    return m
In [ ]:
%%time
county_data = get_county_shapemap()
county_data.set_index('GEOID', inplace=True)

start_date = datetime.strptime('01/01/1950', '%m/%d/%Y')
end_date = datetime.strptime('01/01/2019', '%m/%d/%Y')

for event_type in [x for x in detail_data.EVENT_TYPE.unique() if x not in ["Avalanche", "Blizzard", "Flash Flood", "Hail", "Thunderstorm Wind", "Tornado", "Winter Storm"]]:
    print(event_type)
    m = get_matrix(start_date, end_date, event_type=event_type)
    data = create_geopandas_from_matrix(m)
    print("Creating plots from {} entries".format(len(data)))
    val = get_spatial_lag_quartiles(
        data, 
        title="Spatial Lag Median Risk Quartiles - {}".format(event_type),
        filename="event_output/spatial_lag_quartile_{0}.png".format(event_type.replace('/',' ')),
        ret_values=True)
    county_data[event_type+"_quartile"] = val.cl
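    # NOTE: get_gettis_ord_g appears to be an earlier version of make_gettis_ord_g_plot
    # that is not defined in this notebook (it also exposed counts_p_sim)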
    val = get_gettis_ord_g(
        data, 
        title="Gettis Ord G* Autocorrelation - {}".format(event_type), 
        filename="event_output/gettis_ord_g_{0}.png".format(event_type.replace('/',' ')),
        ret_values=True)
    county_data[event_type+"_g_local"] = val.counts_g_local
    county_data.to_csv("county_overall_output.csv")
In [ ]:
%%time
meta_type_colors = {
    'Cold':"Blues",
    'All':"Greys",
    'Wind':"BuGn",
    'Water':"PuBu",
    'Other':"Greens",
    'Heat':"YlOrRd"
}

county_data = get_county_shapemap()
county_data.set_index("GEOID", inplace=True)

for meta_type, color_map in meta_type_colors.items():
    date = datetime.strptime('01/01/18', '%m/%d/%y')
    while date.year<2019:
        print("Plotting {} - {}".format(meta_type, str(date).split()[0]))
        if meta_type == "All":
            m = get_day_matrix(date, span=14)
        else:
            m = get_day_matrix(date, span=14, meta_type=meta_type)
        data = create_geopandas_from_matrix(m)
        if data is None:
            print("No data found for", date)
        else:
            val = get_gettis_ord_g(
                data,
                title="Gettis Ord G* Autocorrelation - {} - ({})".format(meta_type, str(date).split()[0]), 
                filename="temp.png",
                color_map=color_map,
                ret_values=True
            )
            county_data["{}-{:%m-%d}_p_sim".format(meta_type, date)] = val.counts_p_sim
            county_data.to_csv("weekly_p_sim.csv")
        date = date + timedelta(days=7)
In [ ]:
%%time
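# NOTE: this cell reuses date, meta_type, and color_map left over from the previous cell's state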

county_data = get_county_shapemap()
county_data.set_index("GEOID", inplace=True)

while date.year<2019:
    print("Plotting {} - {}".format(meta_type, str(date).split()[0]))
    m = get_day_matrix(date, span=14, event_type='Tornado')
    data = create_geopandas_from_matrix(m)
    if data is None:
        print("No data found for", date)
    else:
        val = get_gettis_ord_g(
            data,
            title="Gettis Ord G* Autocorrelation - {} - ({})".format(meta_type, str(date).split()[0]), 
            filename="temp.png",
            color_map=color_map,
            ret_values=True
        )
        county_data["{}-{:%m-%d}_p_sim".format(meta_type, date)] = val.counts_p_sim
        county_data.to_csv("tornado_p_sim.csv")
    date = date + timedelta(days=7)
In [ ]:
drought_p_sim = pd.read_csv("tornado_p_sim.csv")
drought_p_sim.set_index("GEOID", inplace=True)
In [ ]:
drought_p_sim.loc[48221][9:].astype(float).values
In [ ]:
geo_id = 29095
rolling = 4
date = datetime.strptime('01/01/18', '%m/%d/%y')
date_list = ["{:%m-%d}".format(date+timedelta(days=x)) for x in range(0, 365, 7)]

f, ax = plt.subplots(1, figsize=(20,5))
sns.lineplot(
    y=running_mean(drought_p_sim.loc[geo_id][9:].astype(float).values, rolling), 
    x=date_list[:-rolling+1], palette="RdGy")

# ax.legend(['Tornado Likelihood'])

plt.xticks(rotation=90)
plt.title("Tornado Likelyhood for 39.0997° N, 94.5786° W (Kansas City, MO)")
plt.savefig("kc_plot.png")
plt.show()
In [ ]:
weekly_p_sim = pd.read_csv("weekly_p_sim.csv")
weekly_p_sim.set_index("GEOID", inplace=True)
In [ ]:
cold_p_sim = weekly_p_sim.iloc[:,9:62].copy()
all_p_sim = weekly_p_sim.iloc[:,62:115].copy()
wind_p_sim = weekly_p_sim.iloc[:,115:168].copy()
water_p_sim = weekly_p_sim.iloc[:,168:221].copy()
other_p_sim = weekly_p_sim.iloc[:,221:274].copy()
heat_p_sim = weekly_p_sim.iloc[:,274:327].copy()
In [ ]:
sns.set()
geo_id = 26161
rolling = 3
date = datetime.strptime('01/01/18', '%m/%d/%y')
date_list = ["{:%m-%d}".format(date+timedelta(days=x)) for x in range(0, 365, 7)]

f, ax = plt.subplots(1, figsize=(20,5))
sns.lineplot(y=running_mean(cold_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])
sns.lineplot(y=running_mean(wind_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])
sns.lineplot(y=running_mean(water_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])
sns.lineplot(y=running_mean(heat_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])

ax.legend(['Cold','Wind','Water','Heat'])
plt.xticks(rotation=90)

plt.savefig("aa_plot.png")
plt.show()
In [ ]:
sns.set()
geo_id = 40109
rolling = 3
date = datetime.strptime('01/01/18', '%m/%d/%y')
date_list = ["{:%m-%d}".format(date+timedelta(days=x)) for x in range(0, 365, 7)]

f, ax = plt.subplots(1, figsize=(20,5))
sns.lineplot(y=running_mean(cold_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])
sns.lineplot(y=running_mean(wind_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])
sns.lineplot(y=running_mean(water_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])
sns.lineplot(y=running_mean(heat_p_sim.loc[geo_id].values, rolling), x=date_list[:-rolling+1])

ax.legend(['Cold','Wind','Water','Heat'])
plt.xticks(rotation=90)
plt.title("Annual Weather Type Risks for 35.4676° N, 97.5164° W (Oklahoma City, OK)")
plt.savefig("ok_plot.png")
plt.show()
In [ ]:
rolling = 5
date = datetime.strptime('01/01/18', '%m/%d/%y')
date_list = [date+timedelta(days=x) for x in range(0, 365, 7)][:-rolling+1]
In [ ]:
def predict(geo_id, date):
    date = date.replace(year=2018)
    p_sims = DataFrame({
        "Cold":running_mean(cold_p_sim.loc[geo_id].values, rolling), 
        "Wind":running_mean(wind_p_sim.loc[geo_id].values, rolling),
        "Water":running_mean(water_p_sim.loc[geo_id].values, rolling),
        "Heat":running_mean(heat_p_sim.loc[geo_id].values, rolling),
        "date":date_list})

    start = date - timedelta(days=4)
    end = date + timedelta(days=4)
    predictions = p_sims[
        (p_sims.date >= start) &
        (p_sims.date <= end)
    ].mean().sort_values(ascending=False)
    
    return predictions
In [ ]:
date = datetime.strptime('05/24/18', '%m/%d/%y')
geo_id = 36111

predictions = predict(geo_id, date)