Complete task 1 in analysis1.ipynb

Some additional work in the process/wrangling phase may be necessary,
which will be determined in task 3.

Closes #14
This commit is contained in:
Nat 2023-03-13 18:43:11 -07:00
parent c378f18458
commit 9a5e5d5149
Signed by: nat
GPG Key ID: B53AB05285D710D6
1 changed files with 116 additions and 66 deletions

View File

@ -27,9 +27,16 @@
"- Obviously, visualizing this among many aspects of the other research questions would involve projecting the data onto a map of the United States, so visualizing this research question would motivate many of the visualizations for other components of this project" "- Obviously, visualizing this among many aspects of the other research questions would involve projecting the data onto a map of the United States, so visualizing this research question would motivate many of the visualizations for other components of this project"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analysis Pipeline"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -41,12 +48,112 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Data Wrangling" "### Loading the data"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 76, "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"## counties - Relating US counties to their long/lat position on the Earth\n",
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
"\n",
"## pol - Election results from the 2012 American presidential election\n",
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
"\n",
"## gb - the gaybourhoods dataset\n",
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"\n",
"# cords - mapping zip codes to long/lat coordinates\n",
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cleaning the data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"counties = counties.rename({\n",
" \"NAME\": \"name\",\n",
" \"INTPTLAT\": \"lat\",\n",
" \"INTPTLON\": \"long\",\n",
"}, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# We only want 2012--the latest election before the gb data was collected\n",
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
"\n",
"# Get rid of undesireable columns\n",
"pol = pol.drop([\n",
" \"year\", \"state\", \"county_fips\", \"office\",\n",
" \"candidate\", \"version\", \"mode\", \"index\",\n",
"], axis=\"columns\")\n",
"\n",
"# Change the column names to make them a little more friendly\n",
"pol.rename({\n",
" \"county_name\": \"county\",\n",
" \"state_po\": \"state\",\n",
" \"candidatevotes\": \"votes\",\n",
" \"totalvotes\": \"total\"\n",
"}, axis=\"columns\", inplace=True)\n",
"\n",
"# Make cells lowercase\n",
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Let's add long/lat columns to gb\n",
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
"\n",
"# Get rid of unneeded columns\n",
"gb = gb.drop([\n",
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
" \"GEOID10\", \"ZIP\",\n",
"], axis=\"columns\")\n",
"\n",
"# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
"gb = gb.rename({\n",
" \"LAT\": \"lat\",\n",
" \"LNG\": \"long\",\n",
"}, axis=\"columns\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process/Wrangle the data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -119,21 +226,12 @@
"4 Cedar IA 41.772360 -91.132610" "4 Cedar IA 41.772360 -91.132610"
] ]
}, },
"execution_count": 76, "execution_count": 11,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"## counties - Relating US counties to their long/lat position on the Earth\n",
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
"\n",
"counties = counties.rename({\n",
" \"NAME\": \"name\",\n",
" \"INTPTLAT\": \"lat\",\n",
" \"INTPTLON\": \"long\",\n",
"}, axis=\"columns\")\n",
"\n",
"# Combine the county name with the state code\n", "# Combine the county name with the state code\n",
"def combine_name_state(row):\n", "def combine_name_state(row):\n",
" row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n", " row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
@ -150,7 +248,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 107, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -241,37 +339,12 @@
"4 Republican 66016 85338 0.773583 30.659218 -87.746067" "4 Republican 66016 85338 0.773583 30.659218 -87.746067"
] ]
}, },
"execution_count": 107, "execution_count": 12,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"## pol - Election results from the 2012 American presidential election\n",
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
"\n",
"# We only want 2012--the latest election before the gb data was collected\n",
"\n",
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
"\n",
"# Get rid of undesireable columns\n",
"pol = pol.drop([\n",
" \"year\", \"state\", \"county_fips\", \"office\",\n",
" \"candidate\", \"version\", \"mode\", \"index\",\n",
"], axis=\"columns\")\n",
"\n",
"# Change the column names to make them a little more friendly\n",
"pol.rename({\n",
" \"county_name\": \"county\",\n",
" \"state_po\": \"state\",\n",
" \"candidatevotes\": \"votes\",\n",
" \"totalvotes\": \"total\"\n",
"}, axis=\"columns\", inplace=True)\n",
"\n",
"# Make cells lowercase\n",
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())\n",
"\n",
"# Combine the county name with the state code\n", "# Combine the county name with the state code\n",
"def combine_name_state(row):\n", "def combine_name_state(row):\n",
" row[\"county\"] = f\"{row['county']} {row['state']}\"\n", " row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
@ -453,29 +526,6 @@
} }
], ],
"source": [ "source": [
"## gb - the gaybourhoods dataset\n",
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n",
"\n",
"# Let's add long/lat columns to gb\n",
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
"\n",
"# Get rid of unneeded columns\n",
"gb = gb.drop([\n",
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
" \"GEOID10\", \"ZIP\",\n",
"], axis=\"columns\")\n",
"\n",
"# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
"gb = gb.rename({\n",
" \"LAT\": \"lat\",\n",
" \"LNG\": \"long\",\n",
"}, axis=\"columns\")\n",
"\n",
"gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n", "gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n",
"gb.head()" "gb.head()"
] ]
@ -742,9 +792,9 @@
"\n", "\n",
"**Is there a correlation between political alignment & living in neighbourhoods with large quantities of LGBT people?**\n", "**Is there a correlation between political alignment & living in neighbourhoods with large quantities of LGBT people?**\n",
"1. Unify the political data with the gaybourhoods data set\n", "1. Unify the political data with the gaybourhoods data set\n",
" 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n", " 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n",
" 2. Find the county that is closest to each observation by minimizing the function established in step (a)\n", " 2. Find the county that is closest to each observation by minimizing the function established in step (a)\n",
" c. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n", " 3. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
"2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n", "2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n",
"\n", "\n",
"**Is there a correlation between geographical stratums & being LGBT?**\n", "**Is there a correlation between geographical stratums & being LGBT?**\n",