Complete task 1 in analysis1.ipynb

Some additional work in the process/wrangling phase may be necessary, which will be determined in task 3. Closes #14
2023-03-13 18:43:11 -07:00 · 2023-03-13 18:43:11 -07:00 · 9a5e5d5149
parent c378f18458
commit 9a5e5d5149
1 changed files with 116 additions and 66 deletions
--- a/analysis/analysis1.ipynb
+++ b/analysis/analysis1.ipynb
@ -27,9 +27,16 @@
    "- Obviously, visualizing this among many aspects of the other research questions would involve projecting the data onto a map of the United States, so visualizing this research question would motivate many of the visualizations for other components of this project"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis Pipeline"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -41,12 +48,112 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Data Wrangling"
+    "### Loading the data"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## counties - Relating US counties to their long/lat position on the Earth\n",
+    "counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
+    "\n",
+    "## pol - County-level American presidential election results, 2000-2020 (narrowed to 2012 during cleaning)\n",
+    "pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
+    "\n",
+    "## gb - the gaybourhoods dataset\n",
+    "gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
+    "\n",
+    "## cords - Mapping zip codes to long/lat coordinates\n",
+    "cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Cleaning the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties = counties.rename({\n",
+    "    \"NAME\": \"name\",\n",
+    "    \"INTPTLAT\": \"lat\",\n",
+    "    \"INTPTLON\": \"long\",\n",
+    "}, axis=\"columns\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We only want 2012--the latest election before the gb data was collected\n",
+    "pol = pol[pol[\"year\"] == 2012].reset_index()\n",
+    "\n",
+    "# Get rid of undesirable columns\n",
+    "pol = pol.drop([\n",
+    "    \"year\", \"state\", \"county_fips\", \"office\",\n",
+    "    \"candidate\", \"version\", \"mode\", \"index\",\n",
+    "], axis=\"columns\")\n",
+    "\n",
+    "# Change the column names to make them a little more friendly\n",
+    "pol.rename({\n",
+    "    \"county_name\": \"county\",\n",
+    "    \"state_po\": \"state\",\n",
+    "    \"candidatevotes\": \"votes\",\n",
+    "    \"totalvotes\": \"total\"\n",
+    "}, axis=\"columns\", inplace=True)\n",
+    "\n",
+    "# Normalize casing: first letter uppercase, the rest lowercase\n",
+    "pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
+    "pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's add long/lat columns to gb\n",
+    "gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
+    "\n",
+    "# Get rid of unneeded columns\n",
+    "gb = gb.drop([\n",
+    "    \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
+    "    \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
+    "    \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
+    "    \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
+    "    \"GEOID10\", \"ZIP\",\n",
+    "], axis=\"columns\")\n",
+    "\n",
+    "# There's a lot of info baked into some of these columns, especially the composite indexes.\n",
+    "# We'll leave their names as is for easy reference even if they're a little ugly.\n",
+    "gb = gb.rename({\n",
+    "    \"LAT\": \"lat\",\n",
+    "    \"LNG\": \"long\",\n",
+    "}, axis=\"columns\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Process/Wrangle the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -119,21 +226,12 @@
       "4     Cedar IA  41.772360 -91.132610"
      ]
     },
-     "execution_count": 76,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "## counties - Relating US counties to their long/lat position on the Earth\n",
-    "counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
-    "\n",
-    "counties = counties.rename({\n",
-    "    \"NAME\": \"name\",\n",
-    "    \"INTPTLAT\": \"lat\",\n",
-    "    \"INTPTLON\": \"long\",\n",
-    "}, axis=\"columns\")\n",
-    "\n",
    "# Combine the county name with the state code\n",
    "def combine_name_state(row):\n",
    "    row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
@ -150,7 +248,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@ -241,37 +339,12 @@
       "4  Republican  66016  85338  0.773583  30.659218 -87.746067"
      ]
     },
-     "execution_count": 107,
+     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "## pol - Election results from the 2012 American presidential election\n",
-    "pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
-    "\n",
-    "# We only want 2012--the latest election before the gb data was collected\n",
-    "\n",
-    "pol = pol[pol[\"year\"] == 2012].reset_index()\n",
-    "\n",
-    "# Get rid of undesireable columns\n",
-    "pol = pol.drop([\n",
-    "    \"year\", \"state\", \"county_fips\", \"office\",\n",
-    "    \"candidate\", \"version\", \"mode\", \"index\",\n",
-    "], axis=\"columns\")\n",
-    "\n",
-    "# Change the column names to make them a little more friendly\n",
-    "pol.rename({\n",
-    "    \"county_name\": \"county\",\n",
-    "    \"state_po\": \"state\",\n",
-    "    \"candidatevotes\": \"votes\",\n",
-    "    \"totalvotes\": \"total\"\n",
-    "}, axis=\"columns\", inplace=True)\n",
-    "\n",
-    "# Make cells lowercase\n",
-    "pol[\"county\"] = pol[\"county\"].apply(lambda x: x.capitalize())\n",
-    "pol[\"party\"] = pol[\"party\"].apply(lambda x: x.capitalize())\n",
-    "\n",
    "# Combine the county name with the state code\n",
    "def combine_name_state(row):\n",
    "    row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
@ -453,29 +526,6 @@
    }
   ],
   "source": [
-    "## gb - the gaybourhoods dataset\n",
-    "gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
-    "cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n",
-    "\n",
-    "# Let's add long/lat columns to gb\n",
-    "gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
-    "\n",
-    "# Get rid of unneeded columns\n",
-    "gb = gb.drop([\n",
-    "    \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
-    "    \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
-    "    \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
-    "    \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
-    "    \"GEOID10\", \"ZIP\",\n",
-    "], axis=\"columns\")\n",
-    "\n",
-    "# There's a lot of info baked into some of these columns. Especially the composite indexes.\n",
-    "# We'll leave their names as is for easy reference even if they're a little ugly.\n",
-    "gb = gb.rename({\n",
-    "    \"LAT\": \"lat\",\n",
-    "    \"LNG\": \"long\",\n",
-    "}, axis=\"columns\")\n",
-    "\n",
    "gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\")\n",
    "gb.head()"
   ]
@ -742,9 +792,9 @@
    "\n",
    "**Is there a correlation between political alignment & living in neighbourhoods with large quantities of LGBT people?**\n",
    "1. Unify the political data with the gaybourhoods data set\n",
-    "  1. Establish the best way to measure the distance from a given `gb` and a county (Euclidean distance? Some other measurement?)\n",
-    "  2. Find the county that is closest to each observation by minimizing the function established in step (a)\n",
-    "  c. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
+    "    1. Establish the best way to measure the distance between a given `gb` observation and a county (Euclidean distance? Some other measurement?)\n",
+    "    2. Find the county that is closest to each observation by minimizing the function established in step 1\n",
+    "    3. Merge the two tables. Each `gb` observation should then include a political breakdown of the nearest county during the 2012 presidential election\n",
    "2. Use this information to plot queerness by different metrics against political alignment and measure the correlation\n",
    "\n",
    "**Is there a correlation between geographical stratums & being LGBT?**\n",