Complete task 1 in analysis1.ipynb
Some additional work in the process/wrangling phase may be necessary, which will be determined in task 3. Closes #14
This commit is contained in:
parent
c378f18458
commit
9a5e5d5149
|
@ -27,9 +27,16 @@
|
|||
"- Visualizing this, like many aspects of the other research questions, would involve projecting the data onto a map of the United States, so the visualizations built for this research question would motivate many of the visualizations for other components of this project"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Analysis Pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -41,12 +48,112 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data Wrangling"
|
||||
"### Loading the data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 76,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## counties - Relating US counties to their long/lat position on the Earth\n",
|
||||
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
|
||||
"\n",
|
||||
"## pol - County-level American presidential election results, 2000-2020 (narrowed to 2012 during cleaning)\n",
|
||||
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
|
||||
"\n",
|
||||
"## gb - the gaybourhoods dataset\n",
|
||||
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
|
||||
"\n",
|
||||
"## cords - Mapping ZIP codes to long/lat coordinates\n",
|
||||
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Cleaning the data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"counties = counties.rename({\n",
|
||||
" \"NAME\": \"name\",\n",
|
||||
" \"INTPTLAT\": \"lat\",\n",
|
||||
" \"INTPTLON\": \"long\",\n",
|
||||
"}, axis=\"columns\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We only want 2012--the latest election before the gb data was collected\n",
|
||||
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
|
||||
"\n",
|
||||
"# Get rid of undesirable columns\n",
|
||||
"pol = pol.drop([\n",
|
||||
" \"year\", \"state\", \"county_fips\", \"office\",\n",
|
||||
" \"candidate\", \"version\", \"mode\", \"index\",\n",
|
||||
"], axis=\"columns\")\n",
|
||||
"\n",
|
||||
"# Change the column names to make them a little more friendly\n",
|
||||
"pol.rename({\n",
|
||||
" \"county_name\": \"county\",\n",
|
||||
" \"state_po\": \"state\",\n",
|
||||
" \"candidatevotes\": \"votes\",\n",
|
||||
" \"totalvotes\": \"total\"\n",
|
||||
"}, axis=\"columns\", inplace=True)\n",
|
||||
"\n",
|
||||
"# Capitalize cells (first letter uppercase, the rest lowercase)\n",
|
||||
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
|
||||
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's add long/lat columns to gb\n",
|
||||
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
|
||||
"\n",
|
||||
"# Get rid of unneeded columns\n",
|
||||
"gb = gb.drop([\n",
|
||||
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
|
||||
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
|
||||
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
|
||||
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
|
||||
" \"GEOID10\", \"ZIP\",\n",
|
||||
"], axis=\"columns\")\n",
|
||||
"\n",
|
||||
"# There's a lot of info baked into some of these columns, especially the composite indexes.\n",
|
||||
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
|
||||
"gb = gb.rename({\n",
|
||||
" \"LAT\": \"lat\",\n",
|
||||
" \"LNG\": \"long\",\n",
|
||||
"}, axis=\"columns\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Process/Wrangle the data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -119,21 +226,12 @@
|
|||
"4 Cedar IA 41.772360 -91.132610"
|
||||
]
|
||||
},
|
||||
"execution_count": 76,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## counties - Relating US counties to their long/lat position on the Earth\n",
|
||||
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
|
||||
"\n",
|
||||
"counties = counties.rename({\n",
|
||||
" \"NAME\": \"name\",\n",
|
||||
" \"INTPTLAT\": \"lat\",\n",
|
||||
" \"INTPTLON\": \"long\",\n",
|
||||
"}, axis=\"columns\")\n",
|
||||
"\n",
|
||||
"# Combine the county name with the state code\n",
|
||||
"def combine_name_state(row):\n",
|
||||
" row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
|
||||
|
@ -150,7 +248,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 107,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -241,37 +339,12 @@
|
|||
"4 Republican 66016 85338 0.773583 30.659218 -87.746067"
|
||||
]
|
||||
},
|
||||
"execution_count": 107,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## pol - Election results from the 2012 American presidential election\n",
|
||||
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
|
||||
"\n",
|
||||
"# We only want 2012--the latest election before the gb data was collected\n",
|
||||
"\n",
|
||||
"pol = pol[pol[\"year\"] == 2012].reset_index()\n",
|
||||
"\n",
|
||||
"# Get rid of undesireable columns\n",
|
||||
"pol = pol.drop([\n",
|
||||
" \"year\", \"state\", \"county_fips\", \"office\",\n",
|
||||
" \"candidate\", \"version\", \"mode\", \"index\",\n",
|
||||
"], axis=\"columns\")\n",
|
||||
"\n",
|
||||
"# Change the column names to make them a little more friendly\n",
|
||||
"pol.rename({\n",
|
||||
" \"county_name\": \"county\",\n",
|
||||
" \"state_po\": \"state\",\n",
|
||||
" \"candidatevotes\": \"votes\",\n",
|
||||
" \"totalvotes\": \"total\"\n",
|
||||
"}, axis=\"columns\", inplace=True)\n",
|
||||
"\n",
|
||||
"# Make cells lowercase\n",
|
||||
"pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
|
||||
"pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())\n",
|
||||
"\n",
|
||||
"# Combine the county name with the state code\n",
|
||||
"def combine_name_state(row):\n",
|
||||
" row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
|
||||
|
@ -453,29 +526,6 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"## gb - the gaybourhoods dataset\n",
|
||||
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
|
||||
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n",
|
||||
"\n",
|
||||
"# Let's add long/lat columns to gb\n",
|
||||
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
|
||||
"\n",
|
||||
"# Get rid of unneeded columns\n",
|
||||
"gb = gb.drop([\n",
|
||||
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
|
||||
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
|
||||
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
|
||||
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
|
||||
" \"GEOID10\", \"ZIP\",\n",
|
||||
"], axis=\"columns\")\n",
|
||||
"\n",
|
||||
"# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
|
||||
"# We'll leave their names as is for easy reference even if they're a little ugly.\n",
|
||||
"gb = gb.rename({\n",
|
||||
" \"LAT\": \"lat\",\n",
|
||||
" \"LNG\": \"long\",\n",
|
||||
"}, axis=\"columns\")\n",
|
||||
"\n",
|
||||
"gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n",
|
||||
"gb.head()"
|
||||
]
|
||||
|
@ -742,9 +792,9 @@
|
|||
"\n",
|
||||
"**Is there a correlation between political alignment & living in neighbourhoods with large quantities of LGBT people?**\n",
|
||||
"1. Unify the political data with the gaybourhoods data set\n",
|
||||
" 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n",
|
||||
" 2. Find the county that is closest to each observation by minimizing the function established in step (a)\n",
|
||||
" c. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
|
||||
"   1. Establish the best way to measure the distance between a given `gb` observation and a county (Euclidean distance? Some other measurement?)\n",
|
||||
"   2. Find the county that is closest to each observation by minimizing the function established in step 1\n",
|
||||
" 3. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
|
||||
"2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n",
|
||||
"\n",
|
||||
"**Is there a correlation between geographical strata & being LGBT?**\n",
|
||||
|
|
Loading…
Reference in New Issue