Complete task 1 in analysis1.ipynb
Some additional work in the process/wrangling phase may be necessary, which will be determined in task 3. Closes #14
This commit is contained in:
parent
c378f18458
commit
9a5e5d5149
|
@ -27,9 +27,16 @@
|
||||||
"- Obviously, visualizing this among many aspects of the other research questions would involve projecting the data onto a map of the United States, so visualizing this research question would motivate many of the visualizations for other components of this project"
|
"- Obviously, visualizing this among many aspects of the other research questions would involve projecting the data onto a map of the United States, so visualizing this research question would motivate many of the visualizations for other components of this project"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Analysis Pipeline"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 50,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -41,12 +48,112 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Data Wrangling"
|
"### Loading the data"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 76,
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"## counties - Relating US counties to their long/lat position on the Earth\n",
|
||||||
|
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
|
||||||
|
"\n",
|
||||||
|
"## pol - Election results from the 2012 American presidential election\n",
|
||||||
|
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
|
||||||
|
"\n",
|
||||||
|
"## gb - the gaybourhoods dataset\n",
|
||||||
|
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
|
||||||
|
"\n",
|
||||||
|
"# cords - mapping zip codes to long/lat coordinates\n",
|
||||||
|
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Cleaning the data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"counties = counties.rename({\n",
|
||||||
|
" \"NAME\": \"name\",\n",
|
||||||
|
" \"INTPTLAT\": \"lat\",\n",
|
||||||
|
" \"INTPTLON\": \"long\",\n",
|
||||||
|
"}, axis=\"columns\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# We only want 2012--the latest election before the gb data was collected\n",
|
||||||
|
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
|
||||||
|
"\n",
|
||||||
|
"# Get rid of undesireable columns\n",
|
||||||
|
"pol = pol.drop([\n",
|
||||||
|
" \"year\", \"state\", \"county_fips\", \"office\",\n",
|
||||||
|
" \"candidate\", \"version\", \"mode\", \"index\",\n",
|
||||||
|
"], axis=\"columns\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Change the column names to make them a little more friendly\n",
|
||||||
|
"pol.rename({\n",
|
||||||
|
" \"county_name\": \"county\",\n",
|
||||||
|
" \"state_po\": \"state\",\n",
|
||||||
|
" \"candidatevotes\": \"votes\",\n",
|
||||||
|
" \"totalvotes\": \"total\"\n",
|
||||||
|
"}, axis=\"columns\", inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Make cells lowercase\n",
|
||||||
|
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
|
||||||
|
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Let's add long/lat columns to gb\n",
|
||||||
|
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Get rid of unneeded columns\n",
|
||||||
|
"gb = gb.drop([\n",
|
||||||
|
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
|
||||||
|
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
|
||||||
|
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
|
||||||
|
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
|
||||||
|
" \"GEOID10\", \"ZIP\",\n",
|
||||||
|
"], axis=\"columns\")\n",
|
||||||
|
"\n",
|
||||||
|
"# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
|
||||||
|
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
|
||||||
|
"gb = gb.rename({\n",
|
||||||
|
" \"LAT\": \"lat\",\n",
|
||||||
|
" \"LNG\": \"long\",\n",
|
||||||
|
"}, axis=\"columns\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Process/Wrangle the data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -119,21 +226,12 @@
|
||||||
"4 Cedar IA 41.772360 -91.132610"
|
"4 Cedar IA 41.772360 -91.132610"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 76,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"## counties - Relating US counties to their long/lat position on the Earth\n",
|
|
||||||
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
|
|
||||||
"\n",
|
|
||||||
"counties = counties.rename({\n",
|
|
||||||
" \"NAME\": \"name\",\n",
|
|
||||||
" \"INTPTLAT\": \"lat\",\n",
|
|
||||||
" \"INTPTLON\": \"long\",\n",
|
|
||||||
"}, axis=\"columns\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Combine the county name with the state code\n",
|
"# Combine the county name with the state code\n",
|
||||||
"def combine_name_state(row):\n",
|
"def combine_name_state(row):\n",
|
||||||
" row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
|
" row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
|
||||||
|
@ -150,7 +248,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 107,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -241,37 +339,12 @@
|
||||||
"4 Republican 66016 85338 0.773583 30.659218 -87.746067"
|
"4 Republican 66016 85338 0.773583 30.659218 -87.746067"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 107,
|
"execution_count": 12,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"## pol - Election results from the 2012 American presidential election\n",
|
|
||||||
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
|
|
||||||
"\n",
|
|
||||||
"# We only want 2012--the latest election before the gb data was collected\n",
|
|
||||||
"\n",
|
|
||||||
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
|
|
||||||
"\n",
|
|
||||||
"# Get rid of undesireable columns\n",
|
|
||||||
"pol = pol.drop([\n",
|
|
||||||
" \"year\", \"state\", \"county_fips\", \"office\",\n",
|
|
||||||
" \"candidate\", \"version\", \"mode\", \"index\",\n",
|
|
||||||
"], axis=\"columns\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Change the column names to make them a little more friendly\n",
|
|
||||||
"pol.rename({\n",
|
|
||||||
" \"county_name\": \"county\",\n",
|
|
||||||
" \"state_po\": \"state\",\n",
|
|
||||||
" \"candidatevotes\": \"votes\",\n",
|
|
||||||
" \"totalvotes\": \"total\"\n",
|
|
||||||
"}, axis=\"columns\", inplace=True)\n",
|
|
||||||
"\n",
|
|
||||||
"# Make cells lowercase\n",
|
|
||||||
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
|
|
||||||
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())\n",
|
|
||||||
"\n",
|
|
||||||
"# Combine the county name with the state code\n",
|
"# Combine the county name with the state code\n",
|
||||||
"def combine_name_state(row):\n",
|
"def combine_name_state(row):\n",
|
||||||
" row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
|
" row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
|
||||||
|
@ -453,29 +526,6 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"## gb - the gaybourhoods dataset\n",
|
|
||||||
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
|
|
||||||
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Let's add long/lat columns to gb\n",
|
|
||||||
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Get rid of unneeded columns\n",
|
|
||||||
"gb = gb.drop([\n",
|
|
||||||
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
|
|
||||||
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
|
|
||||||
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
|
|
||||||
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
|
|
||||||
" \"GEOID10\", \"ZIP\",\n",
|
|
||||||
"], axis=\"columns\")\n",
|
|
||||||
"\n",
|
|
||||||
"# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
|
|
||||||
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
|
|
||||||
"gb = gb.rename({\n",
|
|
||||||
" \"LAT\": \"lat\",\n",
|
|
||||||
" \"LNG\": \"long\",\n",
|
|
||||||
"}, axis=\"columns\")\n",
|
|
||||||
"\n",
|
|
||||||
"gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n",
|
"gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n",
|
||||||
"gb.head()"
|
"gb.head()"
|
||||||
]
|
]
|
||||||
|
@ -744,7 +794,7 @@
|
||||||
"1. Unify the political data with the gaybourhoods data set\n",
|
"1. Unify the political data with the gaybourhoods data set\n",
|
||||||
" 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n",
|
" 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n",
|
||||||
" 2. Find the county that is closest to each observation by minimizing the function established in step (a)\n",
|
" 2. Find the county that is closest to each observation by minimizing the function established in step (a)\n",
|
||||||
" c. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
|
" 3. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
|
||||||
"2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n",
|
"2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**Is there a correlation between geographical stratums & being LGBT?**\n",
|
"**Is there a correlation between geographical stratums & being LGBT?**\n",
|
||||||
|
|
Loading…
Reference in New Issue