69 lines
2.3 KiB
Python
69 lines
2.3 KiB
Python
import pandas as pd
|
|
|
|
def _join_with_space(left, right):
    """Return ``"<left> <right>"`` element-wise for two aligned Series.

    Each value is formatted with ``str`` so the result is the same text an
    f-string would produce for the pair.
    """
    return left.map(str) + " " + right.map(str)


def load_and_process(data_dir="../data/raw"):
    """Load the raw CSV files and return the processed DataFrames.

    Args:
        data_dir: Directory containing ``zip_lat_long.csv``,
            ``us-county-boundaries.csv``, ``countypres_2000-2020.csv`` and
            ``gaybourhoods.csv``. Defaults to ``"../data/raw"`` (relative to
            the current working directory).

    Returns:
        A tuple ``(gb, pol, counties, cords)``:

        * ``gb`` - the gaybourhoods dataset joined to ``cords`` on
          ``GEOID10 == ZIP``, with the unused columns dropped and
          ``LAT``/``LNG`` renamed to ``lat``/``long``.
        * ``pol`` - the 2012 rows of the county presidential results, joined
          to ``counties`` on the combined "<county> <state code>" key, with a
          ``percent`` column equal to ``votes / total``.
        * ``counties`` - the county boundaries file with ``name`` rewritten to
          "<NAME> <STUSAB>" and ``INTPTLAT``/``INTPTLON`` renamed to
          ``lat``/``long``.
        * ``cords`` - the zip-code to lat/long table, as read.
    """
    from pathlib import Path

    raw = Path(data_dir)

    # cords - mapping zip codes to long/lat coordinates
    cords = pd.read_csv(raw / "zip_lat_long.csv")

    ## counties - Relating US counties to their long/lat position on the Earth
    counties = (
        pd.read_csv(raw / "us-county-boundaries.csv", sep=";")
        .rename(
            {
                "NAME": "name",
                "INTPTLAT": "lat",
                "INTPTLON": "long",
            },
            axis="columns",
        )
        # Combine the county name with the state code so that it can serve as
        # the merge key for the election results below.
        .assign(name=lambda df: _join_with_space(df["name"], df["STUSAB"]))
        .drop(["STUSAB"], axis="columns")
    )

    ## pol - Election results from the 2012 American presidential election
    pol = (
        pd.read_csv(raw / "countypres_2000-2020.csv")
        .query("`year` == 2012")
        .reset_index(drop=True)
        # The full-text "state" column is dropped here; "state_po" takes over
        # the name "state" in the rename that follows.
        .drop(
            [
                "year", "state", "county_fips", "office",
                "candidate", "version", "mode",
            ],
            axis="columns",
        )
        .rename(
            {
                "county_name": "county",
                "state_po": "state",
                "candidatevotes": "votes",
                "totalvotes": "total",
            },
            axis="columns",
        )
        # NOTE(review): str.capitalize lower-cases every character after the
        # first, so a multi-word county becomes e.g. "Los angeles". Such rows
        # only survive the merge below if the boundaries file uses the same
        # casing - confirm against the data.
        .apply(lambda col: col.str.capitalize() if col.name in ("county", "party") else col)
        .assign(county=lambda df: _join_with_space(df["county"], df["state"]))
        .merge(counties, left_on="county", right_on="name")
        .drop(["state", "name"], axis="columns")
        .assign(percent=lambda df: df.votes / df.total)
    )

    ## gb - the gaybourhoods dataset
    gb = (
        pd.read_csv(raw / "gaybourhoods.csv")
        .merge(cords, left_on="GEOID10", right_on="ZIP")
        .drop(
            [
                "Mjoint_MF", "Mjoint_SS", "Mjoint_FF", "Mjoint_MM",
                "Cns_TotHH", "Cns_UPSS", "Cns_UPFF", "Cns_UPMM",
                "ParadeFlag", "FF_Tax", "FF_Cns", "MM_Tax", "MM_Cns",
                "SS_Index_Weight", "Parade_Weight", "Bars_Weight",
                "GEOID10", "ZIP",
            ],
            axis="columns",
        )
        .rename(
            {
                "LAT": "lat",
                "LNG": "long",
            },
            axis="columns",
        )
    )

    return (gb, pol, counties, cords)