Complete task 1 in analysis1.ipynb

Some additional work in the process/wrangling phase may be necessary,
which will be determined in task 3.

Closes #14
This commit is contained in:
Nat 2023-03-13 18:43:11 -07:00
parent c378f18458
commit 9a5e5d5149
Signed by: nat
GPG Key ID: B53AB05285D710D6
1 changed files with 116 additions and 66 deletions

View File

@ -27,9 +27,16 @@
"- Visualizing this question, like many aspects of the other research questions, would involve projecting the data onto a map of the United States, so the work done for this question would motivate many of the visualizations for other components of this project"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analysis Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -41,12 +48,112 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Wrangling"
"### Loading the data"
]
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Read every raw input up front; the cleaning cells below reshape them.\n",
"\n",
"# counties: US counties together with their lat/long position on the Earth\n",
"counties = pd.read_csv(\n",
"    \"../data/raw/us-county-boundaries.csv\",\n",
"    delimiter=\";\",  # alias of sep; this file is semicolon-separated\n",
")\n",
"\n",
"# pol: county-level presidential election results (per the file name, 2000-2020);\n",
"# the table is narrowed to the 2012 election during cleaning\n",
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
"\n",
"# gb: the gaybourhoods dataset\n",
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"\n",
"# cords: maps zip codes to lat/long coordinates\n",
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cleaning the data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Give the county name and coordinate columns short, lowercase names.\n",
"counties = counties.rename(columns={\n",
"    \"NAME\": \"name\",\n",
"    \"INTPTLAT\": \"lat\",\n",
"    \"INTPTLON\": \"long\",\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# We only want 2012 -- the latest election before the gb data was collected.\n",
"# drop=True discards the old row labels instead of adding an \"index\" column\n",
"# that would then have to be dropped again.\n",
"pol = pol[pol[\"year\"] == 2012].reset_index(drop=True)\n",
"\n",
"# Get rid of undesirable columns\n",
"pol = pol.drop(columns=[\n",
"    \"year\", \"state\", \"county_fips\", \"office\",\n",
"    \"candidate\", \"version\", \"mode\",\n",
"])\n",
"\n",
"# Change the column names to make them a little more friendly.\n",
"# Reassigning (rather than inplace=True) matches how the other tables are cleaned.\n",
"pol = pol.rename(columns={\n",
"    \"county_name\": \"county\",\n",
"    \"state_po\": \"state\",\n",
"    \"candidatevotes\": \"votes\",\n",
"    \"totalvotes\": \"total\",\n",
"})\n",
"\n",
"# Capitalize the text cells: first letter upper case, the rest lower case.\n",
"# The .str accessor propagates missing values instead of raising on them.\n",
"pol[\"county\"] = pol[\"county\"].str.capitalize()\n",
"pol[\"party\"] = pol[\"party\"].str.capitalize()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Build the cleaned gb table in one chain:\n",
"#   1. attach coordinates by matching GEOID10 against the ZIP column of cords\n",
"#      (merge defaults to an inner join, so unmatched rows are not kept)\n",
"#   2. drop the columns this analysis does not need, including both join keys\n",
"#   3. rename the coordinate columns to lat/long\n",
"# A lot of info is baked into the remaining columns, especially the composite\n",
"# indexes, so their names stay as is for easy reference even if a little ugly.\n",
"gb = (\n",
"    gb\n",
"    .merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
"    .drop(columns=[\n",
"        \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
"        \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
"        \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
"        \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
"        \"GEOID10\", \"ZIP\",\n",
"    ])\n",
"    .rename(columns={\"LAT\": \"lat\", \"LNG\": \"long\"})\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process/Wrangle the data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -119,21 +226,12 @@
"4 Cedar IA 41.772360 -91.132610"
]
},
"execution_count": 76,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## counties - Relating US counties to their long/lat position on the Earth\n",
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
"\n",
"counties = counties.rename({\n",
" \"NAME\": \"name\",\n",
" \"INTPTLAT\": \"lat\",\n",
" \"INTPTLON\": \"long\",\n",
"}, axis=\"columns\")\n",
"\n",
"# Combine the county name with the state code\n",
"def combine_name_state(row):\n",
" row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
@ -150,7 +248,7 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -241,37 +339,12 @@
"4 Republican 66016 85338 0.773583 30.659218 -87.746067"
]
},
"execution_count": 107,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## pol - Election results from the 2012 American presidential election\n",
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
"\n",
"# We only want 2012--the latest election before the gb data was collected\n",
"\n",
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
"\n",
"# Get rid of undesireable columns\n",
"pol = pol.drop([\n",
" \"year\", \"state\", \"county_fips\", \"office\",\n",
" \"candidate\", \"version\", \"mode\", \"index\",\n",
"], axis=\"columns\")\n",
"\n",
"# Change the column names to make them a little more friendly\n",
"pol.rename({\n",
" \"county_name\": \"county\",\n",
" \"state_po\": \"state\",\n",
" \"candidatevotes\": \"votes\",\n",
" \"totalvotes\": \"total\"\n",
"}, axis=\"columns\", inplace=True)\n",
"\n",
"# Make cells lowercase\n",
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())\n",
"\n",
"# Combine the county name with the state code\n",
"def combine_name_state(row):\n",
" row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
@ -453,29 +526,6 @@
}
],
"source": [
"## gb - the gaybourhoods dataset\n",
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n",
"\n",
"# Let's add long/lat columns to gb\n",
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
"\n",
"# Get rid of unneeded columns\n",
"gb = gb.drop([\n",
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
" \"GEOID10\", \"ZIP\",\n",
"], axis=\"columns\")\n",
"\n",
"# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
"gb = gb.rename({\n",
" \"LAT\": \"lat\",\n",
" \"LNG\": \"long\",\n",
"}, axis=\"columns\")\n",
"\n",
"gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n",
"gb.head()"
]
@ -742,9 +792,9 @@
"\n",
"**Is there a correlation between political alignment & living in neighbourhoods with large quantities of LGBT people?**\n",
"1. Unify the political data with the gaybourhoods data set\n",
" 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n",
" 2. Find the county that is closest to each observation by minimizing the function established in step (a)\n",
" c. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
"    1. Establish the best way to measure the distance between a given `gb` observation and a county (Euclidean distance? Some other measurement?)\n",
"    2. Find the county that is closest to each observation by minimizing the function established in sub-step 1\n",
" 3. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
"2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n",
"\n",
"**Is there a correlation between geographical strata & being LGBT?**\n",