Merge pull request #12 from ubco-W2022T2-data301/add-datasets

Add IRS, election and zip code data sets
2023-03-02 17:27:52 -08:00 · 2023-03-02 17:27:52 -08:00 · 597759da16
parent 5c19c84bac e242fed907
commit 597759da16
6 changed files with 272708 additions and 5 deletions
--- a/README.md
+++ b/README.md
@ -30,7 +30,6 @@ Images coming soon.
 - [Men are from Chelsea, Women are from Park Slope](https://pudding.cool/2018/06/gayborhoods/)
  - The article for which the data was originally collected.
 - [The Gaybourhoods data set on Github](https://github.com/the-pudding/data/blob/master/gayborhoods/README.md)
-
-Sources of (potential) secondary data sets:
 - [Data set relating US ZIP codes to their coordinates](https://www.kaggle.com/datasets/joeleichter/us-zip-codes-with-lat-and-long)
- [Geographic situation of taxes payed in the US](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2018-zip-code-data-soi)
+- [Geographic situation of taxes paid in the US](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2015-zip-code-data-soi)
+- [County Presidential Election Returns 2000-2020](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ)
--- a/analysis/analysis2.ipynb
+++ b/analysis/analysis2.ipynb
@ -26,7 +26,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
@ -231,7 +231,7 @@
       "[5 rows x 29 columns]"
      ]
     },
-     "execution_count": 1,
+     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -243,6 +243,247 @@
    "gaybourhoods.head(5)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data wrangling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE: This cell only runs if the raw IRS file exists at ../data/raw/irs_2015.csv.\n",
+    "# The raw file is too big for GitHub to handle, so it has to be downloaded from the\n",
+    "# source linked in the references section of the readme.\n",
+    "irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
+    "\n",
+    "# Naively splitting the IRS data set in two. More formal data wrangling will\n",
+    "# come later.\n",
+    "# Slice at a single midpoint so every row lands in exactly one half; taking\n",
+    "# head(n / 2) and tail(n / 2) silently drops the middle row when n is odd.\n",
+    "split_at = len(irs) // 2\n",
+    "irs1 = irs.iloc[:split_at]\n",
+    "irs2 = irs.iloc[split_at:]\n",
+    "\n",
+    "# index=False keeps the row index out of the written files.\n",
+    "irs1.to_csv(\"../data/processed/irs_2015_1\", index=False)\n",
+    "irs2.to_csv(\"../data/processed/irs_2015_2\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>STATEFIPS</th>\n",
+       "      <th>STATE</th>\n",
+       "      <th>zipcode</th>\n",
+       "      <th>agi_stub</th>\n",
+       "      <th>N1</th>\n",
+       "      <th>mars1</th>\n",
+       "      <th>MARS2</th>\n",
+       "      <th>MARS4</th>\n",
+       "      <th>PREP</th>\n",
+       "      <th>N2</th>\n",
+       "      <th>...</th>\n",
+       "      <th>N10300</th>\n",
+       "      <th>A10300</th>\n",
+       "      <th>N85530</th>\n",
+       "      <th>A85530</th>\n",
+       "      <th>N85300</th>\n",
+       "      <th>A85300</th>\n",
+       "      <th>N11901</th>\n",
+       "      <th>A11901</th>\n",
+       "      <th>N11902</th>\n",
+       "      <th>A11902</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>AL</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>836320.0</td>\n",
+       "      <td>481570.0</td>\n",
+       "      <td>109790.0</td>\n",
+       "      <td>233260.0</td>\n",
+       "      <td>455560.0</td>\n",
+       "      <td>1356760.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>373410.0</td>\n",
+       "      <td>328469.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>61920.0</td>\n",
+       "      <td>48150.0</td>\n",
+       "      <td>732670.0</td>\n",
+       "      <td>1933120.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>AL</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>494830.0</td>\n",
+       "      <td>206630.0</td>\n",
+       "      <td>146250.0</td>\n",
+       "      <td>129390.0</td>\n",
+       "      <td>275920.0</td>\n",
+       "      <td>1010990.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>395880.0</td>\n",
+       "      <td>965011.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>73720.0</td>\n",
+       "      <td>107304.0</td>\n",
+       "      <td>415410.0</td>\n",
+       "      <td>1187403.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>AL</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>261250.0</td>\n",
+       "      <td>80720.0</td>\n",
+       "      <td>139280.0</td>\n",
+       "      <td>36130.0</td>\n",
+       "      <td>155100.0</td>\n",
+       "      <td>583910.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>251490.0</td>\n",
+       "      <td>1333418.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>64200.0</td>\n",
+       "      <td>139598.0</td>\n",
+       "      <td>193030.0</td>\n",
+       "      <td>536699.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>AL</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>166690.0</td>\n",
+       "      <td>28510.0</td>\n",
+       "      <td>124650.0</td>\n",
+       "      <td>10630.0</td>\n",
+       "      <td>99950.0</td>\n",
+       "      <td>423990.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>165320.0</td>\n",
+       "      <td>1414283.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>45460.0</td>\n",
+       "      <td>128823.0</td>\n",
+       "      <td>116440.0</td>\n",
+       "      <td>377177.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>AL</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>212660.0</td>\n",
+       "      <td>19520.0</td>\n",
+       "      <td>184320.0</td>\n",
+       "      <td>4830.0</td>\n",
+       "      <td>126860.0</td>\n",
+       "      <td>589490.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>212000.0</td>\n",
+       "      <td>3820152.0</td>\n",
+       "      <td>420.0</td>\n",
+       "      <td>168.0</td>\n",
+       "      <td>60.0</td>\n",
+       "      <td>31.0</td>\n",
+       "      <td>83330.0</td>\n",
+       "      <td>421004.0</td>\n",
+       "      <td>121570.0</td>\n",
+       "      <td>483682.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 131 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   STATEFIPS STATE  zipcode  agi_stub        N1     mars1     MARS2     MARS4  \\\n",
+       "0          1    AL        0         1  836320.0  481570.0  109790.0  233260.0   \n",
+       "1          1    AL        0         2  494830.0  206630.0  146250.0  129390.0   \n",
+       "2          1    AL        0         3  261250.0   80720.0  139280.0   36130.0   \n",
+       "3          1    AL        0         4  166690.0   28510.0  124650.0   10630.0   \n",
+       "4          1    AL        0         5  212660.0   19520.0  184320.0    4830.0   \n",
+       "\n",
+       "       PREP         N2  ...    N10300     A10300  N85530  A85530  N85300  \\\n",
+       "0  455560.0  1356760.0  ...  373410.0   328469.0     0.0     0.0     0.0   \n",
+       "1  275920.0  1010990.0  ...  395880.0   965011.0     0.0     0.0     0.0   \n",
+       "2  155100.0   583910.0  ...  251490.0  1333418.0     0.0     0.0     0.0   \n",
+       "3   99950.0   423990.0  ...  165320.0  1414283.0     0.0     0.0     0.0   \n",
+       "4  126860.0   589490.0  ...  212000.0  3820152.0   420.0   168.0    60.0   \n",
+       "\n",
+       "   A85300   N11901    A11901    N11902     A11902  \n",
+       "0     0.0  61920.0   48150.0  732670.0  1933120.0  \n",
+       "1     0.0  73720.0  107304.0  415410.0  1187403.0  \n",
+       "2     0.0  64200.0  139598.0  193030.0   536699.0  \n",
+       "3     0.0  45460.0  128823.0  116440.0   377177.0  \n",
+       "4    31.0  83330.0  421004.0  121570.0   483682.0  \n",
+       "\n",
+       "[5 rows x 131 columns]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Now these two datasets can be joined and worked with.\n",
+    "# ignore_index=True gives the combined frame one continuous 0..n-1 index; each\n",
+    "# half is read back with its own default index starting at 0, so a plain concat\n",
+    "# would leave duplicate index labels (e.g. irs.loc[0] would match two rows).\n",
+    "irs = pd.concat(\n",
+    "    [\n",
+    "        pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
+    "        pd.read_csv(\"../data/processed/irs_2015_2\"),\n",
+    "    ],\n",
+    "    ignore_index=True,\n",
+    ")\n",
+    "irs.head()"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/data/processed/irs_2015_1
+++ b/data/processed/irs_2015_1
--- a/data/processed/irs_2015_2
+++ b/data/processed/irs_2015_2
--- a/data/raw/countypres_2000-2020.csv
+++ b/data/raw/countypres_2000-2020.csv
--- a/data/raw/zip_lat_long.csv
+++ b/data/raw/zip_lat_long.csv