pride-data-analysis/analysis/code/project_functions1.py

import numpy as np
from math import pi
import pandas as pd

def load_and_process():
    
    # cords - mapping zip codes to long/lat coordinates
    cords = pd.read_csv("../data/raw/zip_lat_long.csv")
    
    ## counties - Relating US counties to their long/lat position on the Earth
    # Combine the county name with the state code
    def combine_name_state(row):
        row["name"] = f"{row['name']} {row['STUSAB']}"
        return row

    counties = (
        pd.read_csv("../data/raw/us-county-boundaries.csv", sep=";")
        .rename({
            "NAME": "name",
            "INTPTLAT": "lat",
            "INTPTLON": "long",
        }, axis="columns")
        .apply(combine_name_state, axis="columns")
        .drop(["STUSAB"], axis="columns")
    )
    

    ## pol - Election results from the 2012 American presidential election
    def combine_name_state(row):
        row["county"] = f"{row['county']} {row['state']}"
        return row

    pol = (
        pd.read_csv("../data/raw/countypres_2000-2020.csv")
        .query("`year` == 2012 & `party` == 'DEMOCRAT'")
        .reset_index()
        .drop([
            "year", "state", "county_fips", "office",
            "candidate", "version", "mode", "index",
            "party"
        ], axis="columns")
        .rename({
            "county_name": "county",
            "state_po": "state",
            "candidatevotes": "votes",
            "totalvotes": "total"
        }, axis="columns")
        .apply(lambda x: x.str.capitalize() if x.name == "county" else x)
        .apply(combine_name_state, axis="columns")
        .merge(counties, left_on="county", right_on="name")
        .drop(["state", "name"], axis="columns")
        .assign(percent=lambda x: x.votes/x.total)
        .drop(["votes", "total"], axis="columns")
    )

    ## gb - the gaybourhoods dataset
    gb = (
        pd.read_csv("../data/raw/gaybourhoods.csv")
        .merge(cords, left_on="GEOID10", right_on="ZIP")
        .drop([
            "Tax_Mjoint", "TaxRate_SS", "TaxRate_FF", "TaxRate_MM",
            "Cns_RateSS", "Cns_RateFF", "Cns_RateMM", "CountBars",
            "Mjoint_MF", "Mjoint_SS", "Mjoint_FF", "Mjoint_MM",
            "Cns_TotHH", "Cns_UPSS", "Cns_UPFF", "Cns_UPMM",
            "ParadeFlag", "FF_Tax", "FF_Cns", "MM_Tax", "MM_Cns",
            "SS_Index_Weight", "Parade_Weight", "Bars_Weight",
            "GEOID10", "ZIP", "FF_Index", "MM_Index",
        ], axis="columns")
        .rename({
            "LAT": "lat",
            "LNG": "long",
        }, axis="columns")
    )
    
    def kinsify(index, **kwargs):
        max_index = 25
        if index < max_index/7:
            return 0
        elif index < max_index*2/7:
            return 1
        elif index < max_index*3/7:
            return 2
        elif index < max_index*4/7:
            return 3
        elif index < max_index*5/7:
            return 4
        elif index < max_index*6/7:
            return 5
        else:
            return 6
    
    gb["kinsey"] = gb.SS_Index.apply(kinsify, axis="columns")
   
    percent_democrat = np.empty(len(gb.index))
    neighbourhood_kinsey = np.empty(len(gb.index))
    for i, row in gb.iterrows():
        percent_democrat[i] = nearest_neighbour(pol, (row.long, row.lat), interval=.1).percent
        neighbourhood_kinsey[i] = select_smallest_neighbourhood(gb, (row.long, row.lat), interval=.1).kinsey.mean()

    gb["percent_democrat"] = pd.Series(data=percent_democrat)
    gb["neighbourhood_kinsey"] = pd.Series(data=neighbourhood_kinsey)
    
    return (gb, pol, counties, cords)

def select_region(df, left, right, bottom, top):
    """
    Takes a dataframe with columns `long` and `lat` corresponding to
    coordinates and returns a subset of the dataframe containing only entries
    between the given boundaries
    """
    return df[(df["long"] > left) & (df["long"] < right) & (df["lat"] > bottom) & (df["lat"] < top)]

def select_smallest_neighbourhood(df, pos, interval=1, multiplier=1.5, expansion_limit=10):
    subset = select_region(df, pos[0]-interval, pos[0]+interval, pos[1]-interval, pos[1]+interval)
    cinterval = interval
    while subset.count().lat == 0:
        cinterval += interval
        #interval *= multiplier
        subset = select_region(df, pos[0]-cinterval, pos[0]+cinterval, pos[1]-cinterval, pos[1]+cinterval)
    
    return subset

def nearest_neighbour(df, pos, interval=1, multiplier=1.5, expansion_limit=10):
    """
    Given a dataframe with columns `long` and `lat` corresponding to
    coordinates and a `pos` pair of long/lat coordinates, determine the
    coordinates of the nearest observation in the dataset by running the
    following algorithm:
    1. Find all points within (long+-interval, lat+-interval)
    2. If there are no other points within the range, start from step 1 and
    set interval *= multiplier
    3. Calculate the distance between pos and each point in the interval
    3. Return the point with the lowest distance that isn't pos
    """
    
    subset = select_smallest_neighbourhood(df, pos, interval, multiplier, expansion_limit)
    
    subset = subset.assign(distance=distance(*pos, subset["lat"], subset["long"]))
    return subset.sort_values("distance").reset_index().iloc[0]

# Efficient implementation of the haversine formula
# Source: https://stackoverflow.com/a/21623206
def distance(lat1, lon1, lat2, lon2):
    p = pi/180
    a = 0.5 - np.cos((lat2-lat1)*p)/2 + np.cos(lat1*p) * np.cos(lat2*p) * (1-np.cos((lon2-lon1)*p))/2
    return 12742 * np.arcsin(np.sqrt(a))
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`import numpy as np`
			`from math import pi`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`import pandas as pd`

			`def load_and_process():`

			`# cords - mapping zip codes to long/lat coordinates`
			`cords = pd.read_csv("../data/raw/zip_lat_long.csv")`

			`## counties - Relating US counties to their long/lat position on the Earth`
			`# Combine the county name with the state code`
			`def combine_name_state(row):`
			`row["name"] = f"{row['name']} {row['STUSAB']}"`
			`return row`

			`counties = (`
			`pd.read_csv("../data/raw/us-county-boundaries.csv", sep=";")`
			`.rename({`
			`"NAME": "name",`
			`"INTPTLAT": "lat",`
			`"INTPTLON": "long",`
			`}, axis="columns")`
			`.apply(combine_name_state, axis="columns")`
			`.drop(["STUSAB"], axis="columns")`
			`)`


			`## pol - Election results from the 2012 American presidential election`
			`def combine_name_state(row):`
			`row["county"] = f"{row['county']} {row['state']}"`
			`return row`

			`pol = (`
			`pd.read_csv("../data/raw/countypres_2000-2020.csv")`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			.query("`year` == 2012 & `party` == 'DEMOCRAT'")
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`.reset_index()`
			`.drop([`
			`"year", "state", "county_fips", "office",`
			`"candidate", "version", "mode", "index",`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`"party"`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`], axis="columns")`
			`.rename({`
			`"county_name": "county",`
			`"state_po": "state",`
			`"candidatevotes": "votes",`
			`"totalvotes": "total"`
			`}, axis="columns")`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`.apply(lambda x: x.str.capitalize() if x.name == "county" else x)`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`.apply(combine_name_state, axis="columns")`
			`.merge(counties, left_on="county", right_on="name")`
			`.drop(["state", "name"], axis="columns")`
			`.assign(percent=lambda x: x.votes/x.total)`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`.drop(["votes", "total"], axis="columns")`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`)`

			`## gb - the gaybourhoods dataset`
			`gb = (`
			`pd.read_csv("../data/raw/gaybourhoods.csv")`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`.merge(cords, left_on="GEOID10", right_on="ZIP")`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`.drop([`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`"Tax_Mjoint", "TaxRate_SS", "TaxRate_FF", "TaxRate_MM",`
			`"Cns_RateSS", "Cns_RateFF", "Cns_RateMM", "CountBars",`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`"Mjoint_MF", "Mjoint_SS", "Mjoint_FF", "Mjoint_MM",`
			`"Cns_TotHH", "Cns_UPSS", "Cns_UPFF", "Cns_UPMM",`
			`"ParadeFlag", "FF_Tax", "FF_Cns", "MM_Tax", "MM_Cns",`
			`"SS_Index_Weight", "Parade_Weight", "Bars_Weight",`
Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`"GEOID10", "ZIP", "FF_Index", "MM_Index",`
			`], axis="columns")`
Move analysis1 chains to code.project_functions1 2023-03-20 18:46:45 +00:00			`.rename({`
			`"LAT": "lat",`
			`"LNG": "long",`
			`}, axis="columns")`
			`)`

Complete visualizations for Milestone 4 2023-03-25 22:54:16 +00:00			`def kinsify(index, **kwargs):`
			`max_index = 25`
			`if index < max_index/7:`
			`return 0`
			`elif index < max_index*2/7:`
			`return 1`
			`elif index < max_index*3/7:`
			`return 2`
			`elif index < max_index*4/7:`
			`return 3`
			`elif index < max_index*5/7:`
			`return 4`
			`elif index < max_index*6/7:`
			`return 5`
			`else:`
			`return 6`

			`gb["kinsey"] = gb.SS_Index.apply(kinsify, axis="columns")`

			`percent_democrat = np.empty(len(gb.index))`
			`neighbourhood_kinsey = np.empty(len(gb.index))`
			`for i, row in gb.iterrows():`
			`percent_democrat[i] = nearest_neighbour(pol, (row.long, row.lat), interval=.1).percent`
			`neighbourhood_kinsey[i] = select_smallest_neighbourhood(gb, (row.long, row.lat), interval=.1).kinsey.mean()`

			`gb["percent_democrat"] = pd.Series(data=percent_democrat)`
			`gb["neighbourhood_kinsey"] = pd.Series(data=neighbourhood_kinsey)`

			`return (gb, pol, counties, cords)`

			`def select_region(df, left, right, bottom, top):`
			`"""`
			Takes a dataframe with columns `long` and `lat` corresponding to
			`coordinates and returns a subset of the dataframe containing only entries`
			`between the given boundaries`
			`"""`
			`return df[(df["long"] > left) & (df["long"] < right) & (df["lat"] > bottom) & (df["lat"] < top)]`

			`def select_smallest_neighbourhood(df, pos, interval=1, multiplier=1.5, expansion_limit=10):`
			`subset = select_region(df, pos[0]-interval, pos[0]+interval, pos[1]-interval, pos[1]+interval)`
			`cinterval = interval`
			`while subset.count().lat == 0:`
			`cinterval += interval`
			`#interval *= multiplier`
			`subset = select_region(df, pos[0]-cinterval, pos[0]+cinterval, pos[1]-cinterval, pos[1]+cinterval)`

			`return subset`

			`def nearest_neighbour(df, pos, interval=1, multiplier=1.5, expansion_limit=10):`
			`"""`
			Given a dataframe with columns `long` and `lat` corresponding to
			coordinates and a `pos` pair of long/lat coordinates, determine the
			`coordinates of the nearest observation in the dataset by running the`
			`following algorithm:`
			`1. Find all points within (long+-interval, lat+-interval)`
			`2. If there are no other points within the range, start from step 1 and`
			`set interval *= multiplier`
			`3. Calculate the distance between pos and each point in the interval`
			`3. Return the point with the lowest distance that isn't pos`
			`"""`

			`subset = select_smallest_neighbourhood(df, pos, interval, multiplier, expansion_limit)`

			`subset = subset.assign(distance=distance(*pos, subset["lat"], subset["long"]))`
			`return subset.sort_values("distance").reset_index().iloc[0]`

			`# Efficient implementation of the haversine formula`
			`# Source: https://stackoverflow.com/a/21623206`
			`def distance(lat1, lon1, lat2, lon2):`
			`p = pi/180`
			`a = 0.5 - np.cos((lat2-lat1)p)/2 + np.cos(lat1p) * np.cos(lat2p) (1-np.cos((lon2-lon1)*p))/2`
			`return 12742 * np.arcsin(np.sqrt(a))`