Move analysis1 chains to code.project_functions1

2023-03-20 11:46:45 -07:00 · 2023-03-20 11:46:45 -07:00 · 4566009cae
parent 6b4e79d55e
commit 4566009cae
4 changed files with 8844 additions and 42186 deletions
--- a/analysis/analysis1.ipynb
+++ b/analysis/analysis1.ipynb
@ -36,414 +36,39 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
-    "import seaborn as sns"
+    "import seaborn as sns\n",
    "\n",
    "# Absolutely diabolical method of doing relative imports with a package who shares its name with\n",
    "# something in the stdlib in Jupyter Lab because it seems impossible otherwise\n",
    "__import__(\"sys\").path.append(\"./code\")\n",
    "from project_functions1 import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### Loading the data"
+    "## Data Analysis Pipeline"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
-    "## counties - Relating US counties to their long/lat position on the Earth\n",
+    "# Now in one, new-and-improved, non-descript method imported from another file\n",
-    "counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
+    "gb, pol, counties, cords = load_and_process()"
    "\n",
    "## pol - Election results from the 2012 American presidential election\n",
    "pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
    "\n",
    "## gb - the gaybourhoods dataset\n",
    "gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
    "\n",
    "# cords - mapping zip codes to long/lat coordinates\n",
    "cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cleaning the data"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "counties = counties.rename({\n",
    "    \"NAME\": \"name\",\n",
    "    \"INTPTLAT\": \"lat\",\n",
    "    \"INTPTLON\": \"long\",\n",
    "}, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state</th>\n",
       "      <th>county</th>\n",
       "      <th>party</th>\n",
       "      <th>votes</th>\n",
       "      <th>total</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AL</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>Democrat</td>\n",
       "      <td>6363</td>\n",
       "      <td>23932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AL</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>Republican</td>\n",
       "      <td>17379</td>\n",
       "      <td>23932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AL</td>\n",
       "      <td>Autauga</td>\n",
       "      <td>Other</td>\n",
       "      <td>190</td>\n",
       "      <td>23932</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AL</td>\n",
       "      <td>Baldwin</td>\n",
       "      <td>Democrat</td>\n",
       "      <td>18424</td>\n",
       "      <td>85338</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>AL</td>\n",
       "      <td>Baldwin</td>\n",
       "      <td>Republican</td>\n",
       "      <td>66016</td>\n",
       "      <td>85338</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  state   county       party  votes  total\n",
       "0    AL  Autauga    Democrat   6363  23932\n",
       "1    AL  Autauga  Republican  17379  23932\n",
       "2    AL  Autauga       Other    190  23932\n",
       "3    AL  Baldwin    Democrat  18424  85338\n",
       "4    AL  Baldwin  Republican  66016  85338"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We only want 2012--the latest election before the gb data was collected\n",
    "pol = pol.query(\"`year` == 2012\") \\\n",
    "    .reset_index() \\\n",
    "    .drop([\n",
    "        \"year\", \"state\", \"county_fips\", \"office\",\n",
    "        \"candidate\", \"version\", \"mode\", \"index\",\n",
    "    ], axis=\"columns\") \\\n",
    "    .rename({\n",
    "        \"county_name\": \"county\",\n",
    "        \"state_po\": \"state\",\n",
    "        \"candidatevotes\": \"votes\",\n",
    "        \"totalvotes\": \"total\"\n",
    "    }, axis=\"columns\") \\\n",
    "    .apply(lambda x: x.str.capitalize() if x.name == \"county\" or x.name == \"party\" else x)\n",
    "\n",
    "pol.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's add long/lat columns to gb\n",
    "gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\") \\\n",
    "    .drop([\n",
    "        \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
    "        \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
    "        \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
    "        \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
    "        \"GEOID10\", \"ZIP\",\n",
    "    ], axis=\"columns\") \\\n",
    "    .rename({\n",
    "        \"LAT\": \"lat\",\n",
    "        \"LNG\": \"long\",\n",
    "    }, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Process/Wrangle the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Hancock OH</td>\n",
       "      <td>41.000471</td>\n",
       "      <td>-83.666033</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Stafford VA</td>\n",
       "      <td>38.413261</td>\n",
       "      <td>-77.451334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Webster NE</td>\n",
       "      <td>40.180646</td>\n",
       "      <td>-98.498590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Dimmit TX</td>\n",
       "      <td>28.423587</td>\n",
       "      <td>-99.765871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Cedar IA</td>\n",
       "      <td>41.772360</td>\n",
       "      <td>-91.132610</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          name        lat       long\n",
       "0   Hancock OH  41.000471 -83.666033\n",
       "1  Stafford VA  38.413261 -77.451334\n",
       "2   Webster NE  40.180646 -98.498590\n",
       "3    Dimmit TX  28.423587 -99.765871\n",
       "4     Cedar IA  41.772360 -91.132610"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Combine the county name with the state code\n",
    "def combine_name_state(row):\n",
    "    row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
    "    return row\n",
    "\n",
    "counties = counties.apply(combine_name_state, axis=\"columns\") \\\n",
    "    .drop([\"STUSAB\"], axis=\"columns\")\n",
    "\n",
    "counties.to_csv(\"../data/processed/us-county-boundaries.csv\")\n",
    "counties.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>county</th>\n",
       "      <th>party</th>\n",
       "      <th>votes</th>\n",
       "      <th>total</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "      <th>percent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Autauga AL</td>\n",
       "      <td>Democrat</td>\n",
       "      <td>6363</td>\n",
       "      <td>23932</td>\n",
       "      <td>32.532237</td>\n",
       "      <td>-86.646439</td>\n",
       "      <td>0.265878</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Autauga AL</td>\n",
       "      <td>Republican</td>\n",
       "      <td>17379</td>\n",
       "      <td>23932</td>\n",
       "      <td>32.532237</td>\n",
       "      <td>-86.646439</td>\n",
       "      <td>0.726183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Autauga AL</td>\n",
       "      <td>Other</td>\n",
       "      <td>190</td>\n",
       "      <td>23932</td>\n",
       "      <td>32.532237</td>\n",
       "      <td>-86.646439</td>\n",
       "      <td>0.007939</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Baldwin AL</td>\n",
       "      <td>Democrat</td>\n",
       "      <td>18424</td>\n",
       "      <td>85338</td>\n",
       "      <td>30.659218</td>\n",
       "      <td>-87.746067</td>\n",
       "      <td>0.215894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Baldwin AL</td>\n",
       "      <td>Republican</td>\n",
       "      <td>66016</td>\n",
       "      <td>85338</td>\n",
       "      <td>30.659218</td>\n",
       "      <td>-87.746067</td>\n",
       "      <td>0.773583</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       county       party  votes  total        lat       long   percent\n",
       "0  Autauga AL    Democrat   6363  23932  32.532237 -86.646439  0.265878\n",
       "1  Autauga AL  Republican  17379  23932  32.532237 -86.646439  0.726183\n",
       "2  Autauga AL       Other    190  23932  32.532237 -86.646439  0.007939\n",
       "3  Baldwin AL    Democrat  18424  85338  30.659218 -87.746067  0.215894\n",
       "4  Baldwin AL  Republican  66016  85338  30.659218 -87.746067  0.773583"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Combine the county name with the state code\n",
    "def combine_name_state(row):\n",
    "    row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
    "    return row\n",
    "\n",
    "pol = pol.apply(combine_name_state, axis=\"columns\") \\\n",
    "    .merge(counties, left_on=\"county\", right_on=\"name\") \\\n",
    "    .drop([\"state\", \"name\"], axis=\"columns\") \\\n",
    "    .assign(percent=lambda x: x.votes/x.total)\n",
    "\n",
    "pol.to_csv(\"../data/processed/election-2012.csv\", index=False)\n",
    "pol.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@ -596,7 +221,7 @@
       "4  37.773134 -122.411167  "
      ]
     },
-     "execution_count": 8,
+     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -606,6 +231,115 @@
    "gb.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>county</th>\n",
       "      <th>party</th>\n",
       "      <th>votes</th>\n",
       "      <th>total</th>\n",
       "      <th>lat</th>\n",
       "      <th>long</th>\n",
       "      <th>percent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Autauga AL</td>\n",
       "      <td>Democrat</td>\n",
       "      <td>6363</td>\n",
       "      <td>23932</td>\n",
       "      <td>32.532237</td>\n",
       "      <td>-86.646439</td>\n",
       "      <td>0.265878</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Autauga AL</td>\n",
       "      <td>Republican</td>\n",
       "      <td>17379</td>\n",
       "      <td>23932</td>\n",
       "      <td>32.532237</td>\n",
       "      <td>-86.646439</td>\n",
       "      <td>0.726183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Autauga AL</td>\n",
       "      <td>Other</td>\n",
       "      <td>190</td>\n",
       "      <td>23932</td>\n",
       "      <td>32.532237</td>\n",
       "      <td>-86.646439</td>\n",
       "      <td>0.007939</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Baldwin AL</td>\n",
       "      <td>Democrat</td>\n",
       "      <td>18424</td>\n",
       "      <td>85338</td>\n",
       "      <td>30.659218</td>\n",
       "      <td>-87.746067</td>\n",
       "      <td>0.215894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Baldwin AL</td>\n",
       "      <td>Republican</td>\n",
       "      <td>66016</td>\n",
       "      <td>85338</td>\n",
       "      <td>30.659218</td>\n",
       "      <td>-87.746067</td>\n",
       "      <td>0.773583</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       county       party  votes  total        lat       long   percent\n",
       "0  Autauga AL    Democrat   6363  23932  32.532237 -86.646439  0.265878\n",
       "1  Autauga AL  Republican  17379  23932  32.532237 -86.646439  0.726183\n",
       "2  Autauga AL       Other    190  23932  32.532237 -86.646439  0.007939\n",
       "3  Baldwin AL    Democrat  18424  85338  30.659218 -87.746067  0.215894\n",
       "4  Baldwin AL  Republican  66016  85338  30.659218 -87.746067  0.773583"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pol.to_csv(\"../data/processed/election-2012.csv\")\n",
    "pol.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
--- a/analysis/code/project_functions1.py
+++ b/analysis/code/project_functions1.py
@ -0,0 +1,69 @@
 import pandas as pd
 def load_and_process(raw_dir="../data/raw"):
    """Load the four raw CSV datasets and return them cleaned and joined.

    Parameters
    ----------
    raw_dir : str or path-like, default "../data/raw"
        Directory containing ``zip_lat_long.csv``, ``us-county-boundaries.csv``
        (semicolon separated), ``countypres_2000-2020.csv`` and
        ``gaybourhoods.csv``. The default is the location that used to be
        hard-coded; being relative, it is resolved against the current
        working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gb, pol, counties, cords)`` where

        * ``gb`` is the gaybourhoods table joined to ``cords`` on
          ``GEOID10 == ZIP``, with the listed helper columns dropped and
          ``LAT``/``LNG`` renamed to ``lat``/``long``;
        * ``pol`` holds the 2012 rows of the election table joined to
          ``counties``, with ``county`` in the form ``"<name> <state code>"``
          and ``percent = votes / total``;
        * ``counties`` is the boundary table with ``name`` rewritten to
          ``"<NAME> <STUSAB>"`` and ``INTPTLAT``/``INTPTLON`` renamed to
          ``lat``/``long``;
        * ``cords`` is the zip-code table exactly as read.

    Raises
    ------
    FileNotFoundError
        If one of the CSV files is missing from ``raw_dir``.
    KeyError
        If a column that is dropped or referenced below is absent.
    """
    # Local import keeps this block self-contained; the module itself only imports pandas.
    from pathlib import Path

    raw_dir = Path(raw_dir)

    # cords - mapping zip codes to long/lat coordinates
    cords = pd.read_csv(raw_dir / "zip_lat_long.csv")

    ## counties - Relating US counties to their long/lat position on the Earth
    counties = (
        pd.read_csv(raw_dir / "us-county-boundaries.csv", sep=";")
        .rename(columns={
            "NAME": "name",
            "INTPTLAT": "lat",
            "INTPTLON": "long",
        })
        # Combine the county name with the state code (vectorized rather than
        # a Python-level call per row).
        .assign(name=lambda df: df["name"].astype(str) + " " + df["STUSAB"].astype(str))
        .drop(columns=["STUSAB"])
    )

    ## pol - Election results from the 2012 American presidential election
    pol = (
        pd.read_csv(raw_dir / "countypres_2000-2020.csv")
        .query("`year` == 2012")
        .drop(columns=[
            "year", "state", "county_fips", "office",
            "candidate", "version", "mode",
        ])
        .rename(columns={
            "county_name": "county",
            "state_po": "state",
            "candidatevotes": "votes",
            "totalvotes": "total",
        })
        .assign(
            party=lambda df: df["party"].str.capitalize(),
            # Join case-insensitively. str.capitalize() lower-cases every
            # character after the first ("ST. CLAIR" -> "St. clair"), so an
            # exact-match join silently drops any county whose name has a
            # second capital letter.
            _join_key=lambda df: (
                df["county"].astype(str) + " " + df["state"].astype(str)
            ).str.lower(),
        )
        .merge(
            counties.assign(_join_key=counties["name"].str.lower()),
            on="_join_key",
        )
        # Use the boundary file's spelling for the displayed county; for every
        # row the old exact-match join kept, this is the same string as before.
        .assign(county=lambda df: df["name"])
        .drop(columns=["state", "name", "_join_key"])
        .assign(percent=lambda df: df["votes"] / df["total"])
    )

    ## gb - the gaybourhoods dataset
    gb = (
        pd.read_csv(raw_dir / "gaybourhoods.csv")
        # Add long/lat columns by zip code
        .merge(cords, left_on="GEOID10", right_on="ZIP")
        .drop(columns=[
            "Mjoint_MF", "Mjoint_SS", "Mjoint_FF", "Mjoint_MM",
            "Cns_TotHH", "Cns_UPSS", "Cns_UPFF", "Cns_UPMM",
            "ParadeFlag", "FF_Tax", "FF_Cns", "MM_Tax", "MM_Cns",
            "SS_Index_Weight", "Parade_Weight", "Bars_Weight",
            "GEOID10", "ZIP",
        ])
        .rename(columns={
            "LAT": "lat",
            "LNG": "long",
        })
    )

    return (gb, pol, counties, cords)
--- a/analysis/code/zip_lat_long.csv
+++ b/analysis/code/zip_lat_long.csv
--- a/data/processed/election-2012.csv
+++ b/data/processed/election-2012.csv