Complete visualizations for Milestone 4

2023-03-25 15:54:16 -07:00 · 2023-03-25 15:54:16 -07:00 · db02382154
parent 4566009cae
commit db02382154
20 changed files with 5922 additions and 11134 deletions
--- a/analysis/analysis1.ipynb
+++ b/analysis/analysis1.ipynb
--- a/analysis/code/project_functions1.py
+++ b/analysis/code/project_functions1.py
@ -1,3 +1,5 @@
+import numpy as np
+from math import pi
 import pandas as pd

 def load_and_process():
@ -30,11 +32,12 @@ def load_and_process():

    pol = (
        pd.read_csv("../data/raw/countypres_2000-2020.csv")
-        .query("`year` == 2012")
+        .query("`year` == 2012 & `party` == 'DEMOCRAT'")
        .reset_index()
        .drop([
            "year", "state", "county_fips", "office",
            "candidate", "version", "mode", "index",
+            "party"
        ], axis="columns")
        .rename({
            "county_name": "county",
@ -42,28 +45,102 @@ def load_and_process():
            "candidatevotes": "votes",
            "totalvotes": "total"
        }, axis="columns")
-        .apply(lambda x: x.str.capitalize() if x.name == "county" or x.name == "party" else x)
+        .apply(lambda x: x.str.capitalize() if x.name == "county" else x)
        .apply(combine_name_state, axis="columns")
        .merge(counties, left_on="county", right_on="name")
        .drop(["state", "name"], axis="columns")
        .assign(percent=lambda x: x.votes/x.total)
+        .drop(["votes", "total"], axis="columns")
    )

    ## gb - the gaybourhoods dataset
    gb = (
        pd.read_csv("../data/raw/gaybourhoods.csv")
-        .merge(cords, left_on="GEOID10", right_on="ZIP") \
+        .merge(cords, left_on="GEOID10", right_on="ZIP")
        .drop([
+            "Tax_Mjoint", "TaxRate_SS", "TaxRate_FF", "TaxRate_MM",
+            "Cns_RateSS", "Cns_RateFF", "Cns_RateMM", "CountBars",
            "Mjoint_MF", "Mjoint_SS", "Mjoint_FF", "Mjoint_MM",
            "Cns_TotHH", "Cns_UPSS", "Cns_UPFF", "Cns_UPMM",
            "ParadeFlag", "FF_Tax", "FF_Cns", "MM_Tax", "MM_Cns",
            "SS_Index_Weight", "Parade_Weight", "Bars_Weight",
-            "GEOID10", "ZIP",
-        ], axis="columns") \
+            "GEOID10", "ZIP", "FF_Index", "MM_Index",
+        ], axis="columns")
        .rename({
            "LAT": "lat",
            "LNG": "long",
        }, axis="columns")
    )
    
-    return (gb, pol, counties, cords)
+    def kinsify(index, **kwargs):
+        max_index = 25
+        if index < max_index/7:
+            return 0
+        elif index < max_index*2/7:
+            return 1
+        elif index < max_index*3/7:
+            return 2
+        elif index < max_index*4/7:
+            return 3
+        elif index < max_index*5/7:
+            return 4
+        elif index < max_index*6/7:
+            return 5
+        else:
+            return 6
+    
+    gb["kinsey"] = gb.SS_Index.apply(kinsify, axis="columns")
+   
+    percent_democrat = np.empty(len(gb.index))
+    neighbourhood_kinsey = np.empty(len(gb.index))
+    for i, row in gb.iterrows():
+        percent_democrat[i] = nearest_neighbour(pol, (row.long, row.lat), interval=.1).percent
+        neighbourhood_kinsey[i] = select_smallest_neighbourhood(gb, (row.long, row.lat), interval=.1).kinsey.mean()
+
+    gb["percent_democrat"] = pd.Series(data=percent_democrat)
+    gb["neighbourhood_kinsey"] = pd.Series(data=neighbourhood_kinsey)
+    
+    return (gb, pol, counties, cords)
+
+def select_region(df, left, right, bottom, top):
+    """
+    Takes a dataframe with columns `long` and `lat` corresponding to
+    coordinates and returns a subset of the dataframe containing only entries
+    between the given boundaries
+    """
+    return df[(df["long"] > left) & (df["long"] < right) & (df["lat"] > bottom) & (df["lat"] < top)]
+
+def select_smallest_neighbourhood(df, pos, interval=1, multiplier=1.5, expansion_limit=10):
+    subset = select_region(df, pos[0]-interval, pos[0]+interval, pos[1]-interval, pos[1]+interval)
+    cinterval = interval
+    while subset.count().lat == 0:
+        cinterval += interval
+        #interval *= multiplier
+        subset = select_region(df, pos[0]-cinterval, pos[0]+cinterval, pos[1]-cinterval, pos[1]+cinterval)
+    
+    return subset
+
+def nearest_neighbour(df, pos, interval=1, multiplier=1.5, expansion_limit=10):
+    """
+    Given a dataframe with columns `long` and `lat` corresponding to
+    coordinates and a `pos` pair of long/lat coordinates, determine the
+    coordinates of the nearest observation in the dataset by running the
+    following algorithm:
+    1. Find all points within (long+-interval, lat+-interval)
+    2. If there are no other points within the range, start from step 1 and
+    set interval *= multiplier
+    3. Calculate the distance between pos and each point in the interval
+    3. Return the point with the lowest distance that isn't pos
+    """
+    
+    subset = select_smallest_neighbourhood(df, pos, interval, multiplier, expansion_limit)
+    
+    subset = subset.assign(distance=distance(*pos, subset["lat"], subset["long"]))
+    return subset.sort_values("distance").reset_index().iloc[0]
+
+# Efficient implementation of the haversine formula
+# Source: https://stackoverflow.com/a/21623206
+def distance(lat1, lon1, lat2, lon2):
+    p = pi/180
+    a = 0.5 - np.cos((lat2-lat1)*p)/2 + np.cos(lat1*p) * np.cos(lat2*p) * (1-np.cos((lon2-lon1)*p))/2
+    return 12742 * np.arcsin(np.sqrt(a))
--- a/analysis/images/atlanta.png
+++ b/analysis/images/atlanta.png
--- a/analysis/images/austin.png
+++ b/analysis/images/austin.png
--- a/analysis/images/boston.png
+++ b/analysis/images/boston.png
--- a/analysis/images/chicago.png
+++ b/analysis/images/chicago.png
--- a/analysis/images/denver.png
+++ b/analysis/images/denver.png
--- a/analysis/images/houston.png
+++ b/analysis/images/houston.png
--- a/analysis/images/los-angeles.png
+++ b/analysis/images/los-angeles.png
--- a/analysis/images/miami.png
+++ b/analysis/images/miami.png
--- a/analysis/images/new-orleans.png
+++ b/analysis/images/new-orleans.png
--- a/analysis/images/new-york-adjusted.png
+++ b/analysis/images/new-york-adjusted.png
--- a/analysis/images/new-york.png
+++ b/analysis/images/new-york.png
--- a/analysis/images/philadelphia.png
+++ b/analysis/images/philadelphia.png
--- a/analysis/images/portland.png
+++ b/analysis/images/portland.png
--- a/analysis/images/san-francisco.png
+++ b/analysis/images/san-francisco.png
--- a/analysis/images/seattle.png
+++ b/analysis/images/seattle.png
--- a/analysis/images/washington-dc.png
+++ b/analysis/images/washington-dc.png
--- a/data/processed/election-2012.csv
+++ b/data/processed/election-2012.csv
--- a/data/processed/gaybourhoods-nat.csv
+++ b/data/processed/gaybourhoods-nat.csv