diff --git a/analysis/analysis1.ipynb b/analysis/analysis1.ipynb index 2b5847f..76b1291 100644 --- a/analysis/analysis1.ipynb +++ b/analysis/analysis1.ipynb @@ -27,9 +27,16 @@ "- Obviously, visualizing this among many aspects of the other research questions would involve projecting the data onto a map of the United States, so visualizing this research question would motivate many of the visualizations for other components of this project" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis Pipeline" + ] + }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -41,12 +48,112 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Data Wrangling" + "### Loading the data" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "## counties - Relating US counties to their long/lat position on the Earth\n", + "counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n", + "\n", + "## pol - Election results from the 2012 American presidential election\n", + "pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n", + "\n", + "## gb - the gaybourhoods dataset\n", + "gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n", + "\n", + "# cords - mapping zip codes to long/lat coordinates\n", + "cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cleaning the data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "counties = counties.rename({\n", + " \"NAME\": \"name\",\n", + " \"INTPTLAT\": \"lat\",\n", + " \"INTPTLON\": \"long\",\n", + "}, axis=\"columns\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# We only want 2012--the latest election before the gb data was 
collected\n", + "pol = pol[pol[\"year\"] == 2012].reset_index()\n", + "\n", + "# Get rid of undesirable columns\n", + "pol = pol.drop([\n", + " \"year\", \"state\", \"county_fips\", \"office\",\n", + " \"candidate\", \"version\", \"mode\", \"index\",\n", + "], axis=\"columns\")\n", + "\n", + "# Change the column names to make them a little more friendly\n", + "pol.rename({\n", + " \"county_name\": \"county\",\n", + " \"state_po\": \"state\",\n", + " \"candidatevotes\": \"votes\",\n", + " \"totalvotes\": \"total\"\n", + "}, axis=\"columns\", inplace=True)\n", + "\n", + "# Capitalize county and party values (first letter uppercase, the rest lowercase)\n", + "pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n", + "pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's add long/lat columns to gb\n", + "gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n", + "\n", + "# Get rid of unneeded columns\n", + "gb = gb.drop([\n", + " \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n", + " \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n", + " \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n", + " \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n", + " \"GEOID10\", \"ZIP\",\n", + "], axis=\"columns\")\n", + "\n", + "# There's a lot of info baked into some of these columns. 
Especially the composite indexes.\n", + "# We'll leave their names as is for easy reference even if they're a little ugly.\n", + "gb = gb.rename({\n", + " \"LAT\": \"lat\",\n", + " \"LNG\": \"long\",\n", + "}, axis=\"columns\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process/Wrangle the data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -119,21 +226,12 @@ "4 Cedar IA 41.772360 -91.132610" ] }, - "execution_count": 76, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "## counties - Relating US counties to their long/lat position on the Earth\n", - "counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n", - "\n", - "counties = counties.rename({\n", - " \"NAME\": \"name\",\n", - " \"INTPTLAT\": \"lat\",\n", - " \"INTPTLON\": \"long\",\n", - "}, axis=\"columns\")\n", - "\n", "# Combine the county name with the state code\n", "def combine_name_state(row):\n", " row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n", @@ -150,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -241,37 +339,12 @@ "4 Republican 66016 85338 0.773583 30.659218 -87.746067" ] }, - "execution_count": 107, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "## pol - Election results from the 2012 American presidential election\n", - "pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n", - "\n", - "# We only want 2012--the latest election before the gb data was collected\n", - "\n", - "pol = pol[pol[\"year\"] == 2012].reset_index()\n", - "\n", - "# Get rid of undesireable columns\n", - "pol = pol.drop([\n", - " \"year\", \"state\", \"county_fips\", \"office\",\n", - " \"candidate\", \"version\", \"mode\", \"index\",\n", - "], axis=\"columns\")\n", - "\n", - "# Change the column names to make them a little more friendly\n", - 
"pol.rename({\n", - " \"county_name\": \"county\",\n", - " \"state_po\": \"state\",\n", - " \"candidatevotes\": \"votes\",\n", - " \"totalvotes\": \"total\"\n", - "}, axis=\"columns\", inplace=True)\n", - "\n", - "# Make cells lowercase\n", - "pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n", - "pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())\n", - "\n", "# Combine the county name with the state code\n", "def combine_name_state(row):\n", " row[\"county\"] = f\"{row['county']} {row['state']}\"\n", @@ -453,29 +526,6 @@ } ], "source": [ - "## gb - the gaybourhoods dataset\n", - "gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n", - "cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n", - "\n", - "# Let's add long/lat columns to gb\n", - "gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n", - "\n", - "# Get rid of unneeded columns\n", - "gb = gb.drop([\n", - " \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n", - " \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n", - " \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n", - " \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n", - " \"GEOID10\", \"ZIP\",\n", - "], axis=\"columns\")\n", - "\n", - "# There's a lot of info baked into some of these columns. Especially the composite indexes.\n", - "# We'll leave their names as is for easy reference even if they're a little ugly.\n", - "gb = gb.rename({\n", - " \"LAT\": \"lat\",\n", - " \"LNG\": \"long\",\n", - "}, axis=\"columns\")\n", - "\n", "gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n", "gb.head()" ] @@ -742,9 +792,9 @@ "\n", "**Is there a correlation between political alignment & living in neighbourhoods with large quantities of LGBT people?**\n", "1. Unify the political data with the gaybourhoods data set\n", - " 1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n", - " 2. 
Find the county that is closest to each observation by minimizing the function established in step (a)\n", - " c. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n", + " 1. Establish the best way to measure the distance between a given `gb` observation and a county (Euclidean distance? Some other measurement?)\n", + " 2. Find the county that is closest to each observation by minimizing the function established in step 1\n", + " 3. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n", "2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n", "\n", "**Is there a correlation between geographical stratums & being LGBT?**\n",