analysis 2 wrangled

tentative; analysis 2 ready for plotting & visualization
2023-03-03 00:51:14 -08:00 · 2023-03-03 00:51:14 -08:00 · 8de27d9c5e
parent f0fd88fb08
commit 8de27d9c5e
1 changed files with 196 additions and 210 deletions
--- a/analysis/analysis2.ipynb
+++ b/analysis/analysis2.ipynb
@ -26,7 +26,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
@ -231,13 +231,15 @@
       "[5 rows x 29 columns]"
      ]
     },
-     "execution_count": 16,
+     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import numpy as np\n",
    "\n",
    "gaybourhoods = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
    "gaybourhoods.head(5)"
@ -252,13 +254,33 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 44,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '../data/raw/irs_2015.csv'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[44], line 5\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[39m# NOTE: This cell will not work unless this file is in the repository. The source\u001b[39;00m\n\u001b[0;32m      2\u001b[0m \u001b[39m# can be found linked in the references section of the readme, however, it is too\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \u001b[39m# big for GitHub to handle.\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m irs \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39m../data/raw/irs_2015.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m      7\u001b[0m \u001b[39m# Naively splitting the IRS data set in two. More formal data wrangling will\u001b[39;00m\n\u001b[0;32m      8\u001b[0m \u001b[39m# come later\u001b[39;00m\n\u001b[0;32m      9\u001b[0m irs1 \u001b[39m=\u001b[39m irs\u001b[39m.\u001b[39mhead(\u001b[39mint\u001b[39m(irs\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m] \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m))\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    209\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[0;32m    210\u001b[0m         kwargs[new_arg_name] \u001b[39m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[0;32m    326\u001b[0m     warnings\u001b[39m.\u001b[39mwarn(\n\u001b[0;32m    327\u001b[0m         msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m    328\u001b[0m         \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[0;32m    329\u001b[0m         stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    330\u001b[0m     )\n\u001b[1;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m    935\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m    936\u001b[0m     dialect,\n\u001b[0;32m    937\u001b[0m     delimiter,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    946\u001b[0m     defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m    947\u001b[0m )\n\u001b[0;32m    948\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m    602\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m    604\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[0;32m    607\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m    608\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m   1439\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m   1441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m   1733\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m   1734\u001b[0m         mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m   1736\u001b[0m     f,\n\u001b[0;32m   1737\u001b[0m     mode,\n\u001b[0;32m   1738\u001b[0m     encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m   1739\u001b[0m     compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m   1740\u001b[0m     memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m   1741\u001b[0m     is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m   1742\u001b[0m     errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m   
1743\u001b[0m     storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m   1744\u001b[0m )\n\u001b[0;32m   1745\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m   1746\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
+      "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m    851\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m    852\u001b[0m     \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m    853\u001b[0m     \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m    854\u001b[0m     \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m    855\u001b[0m         \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(\n\u001b[0;32m    857\u001b[0m             handle,\n\u001b[0;32m    858\u001b[0m             ioargs\u001b[39m.\u001b[39mmode,\n\u001b[0;32m    859\u001b[0m             encoding\u001b[39m=\u001b[39mioargs\u001b[39m.\u001b[39mencoding,\n\u001b[0;32m    860\u001b[0m             errors\u001b[39m=\u001b[39merrors,\n\u001b[0;32m    861\u001b[0m             newline\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m    862\u001b[0m         )\n\u001b[0;32m    863\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[0;32m    864\u001b[0m         \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m    865\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
+      "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/raw/irs_2015.csv'"
+     ]
+    }
+   ],
   "source": [
    "# NOTE: This cell will not work unless this file is in the repository. The source\n",
    "# can be found linked in the references section of the readme, however, it is too\n",
    "# big for GitHub to handle.\n",
+    "\n",
    "irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
    "\n",
    "# Naively splitting the IRS data set in two. More formal data wrangling will\n",
@ -272,216 +294,173 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 45,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>STATEFIPS</th>\n",
-       "      <th>STATE</th>\n",
-       "      <th>zipcode</th>\n",
-       "      <th>agi_stub</th>\n",
-       "      <th>N1</th>\n",
-       "      <th>mars1</th>\n",
-       "      <th>MARS2</th>\n",
-       "      <th>MARS4</th>\n",
-       "      <th>PREP</th>\n",
-       "      <th>N2</th>\n",
-       "      <th>...</th>\n",
-       "      <th>N10300</th>\n",
-       "      <th>A10300</th>\n",
-       "      <th>N85530</th>\n",
-       "      <th>A85530</th>\n",
-       "      <th>N85300</th>\n",
-       "      <th>A85300</th>\n",
-       "      <th>N11901</th>\n",
-       "      <th>A11901</th>\n",
-       "      <th>N11902</th>\n",
-       "      <th>A11902</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>AL</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>836320.0</td>\n",
-       "      <td>481570.0</td>\n",
-       "      <td>109790.0</td>\n",
-       "      <td>233260.0</td>\n",
-       "      <td>455560.0</td>\n",
-       "      <td>1356760.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>373410.0</td>\n",
-       "      <td>328469.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>61920.0</td>\n",
-       "      <td>48150.0</td>\n",
-       "      <td>732670.0</td>\n",
-       "      <td>1933120.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>AL</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>494830.0</td>\n",
-       "      <td>206630.0</td>\n",
-       "      <td>146250.0</td>\n",
-       "      <td>129390.0</td>\n",
-       "      <td>275920.0</td>\n",
-       "      <td>1010990.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>395880.0</td>\n",
-       "      <td>965011.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>73720.0</td>\n",
-       "      <td>107304.0</td>\n",
-       "      <td>415410.0</td>\n",
-       "      <td>1187403.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1</td>\n",
-       "      <td>AL</td>\n",
-       "      <td>0</td>\n",
-       "      <td>3</td>\n",
-       "      <td>261250.0</td>\n",
-       "      <td>80720.0</td>\n",
-       "      <td>139280.0</td>\n",
-       "      <td>36130.0</td>\n",
-       "      <td>155100.0</td>\n",
-       "      <td>583910.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>251490.0</td>\n",
-       "      <td>1333418.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>64200.0</td>\n",
-       "      <td>139598.0</td>\n",
-       "      <td>193030.0</td>\n",
-       "      <td>536699.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1</td>\n",
-       "      <td>AL</td>\n",
-       "      <td>0</td>\n",
-       "      <td>4</td>\n",
-       "      <td>166690.0</td>\n",
-       "      <td>28510.0</td>\n",
-       "      <td>124650.0</td>\n",
-       "      <td>10630.0</td>\n",
-       "      <td>99950.0</td>\n",
-       "      <td>423990.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>165320.0</td>\n",
-       "      <td>1414283.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>45460.0</td>\n",
-       "      <td>128823.0</td>\n",
-       "      <td>116440.0</td>\n",
-       "      <td>377177.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1</td>\n",
-       "      <td>AL</td>\n",
-       "      <td>0</td>\n",
-       "      <td>5</td>\n",
-       "      <td>212660.0</td>\n",
-       "      <td>19520.0</td>\n",
-       "      <td>184320.0</td>\n",
-       "      <td>4830.0</td>\n",
-       "      <td>126860.0</td>\n",
-       "      <td>589490.0</td>\n",
-       "      <td>...</td>\n",
-       "      <td>212000.0</td>\n",
-       "      <td>3820152.0</td>\n",
-       "      <td>420.0</td>\n",
-       "      <td>168.0</td>\n",
-       "      <td>60.0</td>\n",
-       "      <td>31.0</td>\n",
-       "      <td>83330.0</td>\n",
-       "      <td>421004.0</td>\n",
-       "      <td>121570.0</td>\n",
-       "      <td>483682.0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>5 rows × 131 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   STATEFIPS STATE  zipcode  agi_stub        N1     mars1     MARS2     MARS4  \\\n",
-       "0          1    AL        0         1  836320.0  481570.0  109790.0  233260.0   \n",
-       "1          1    AL        0         2  494830.0  206630.0  146250.0  129390.0   \n",
-       "2          1    AL        0         3  261250.0   80720.0  139280.0   36130.0   \n",
-       "3          1    AL        0         4  166690.0   28510.0  124650.0   10630.0   \n",
-       "4          1    AL        0         5  212660.0   19520.0  184320.0    4830.0   \n",
-       "\n",
-       "       PREP         N2  ...    N10300     A10300  N85530  A85530  N85300  \\\n",
-       "0  455560.0  1356760.0  ...  373410.0   328469.0     0.0     0.0     0.0   \n",
-       "1  275920.0  1010990.0  ...  395880.0   965011.0     0.0     0.0     0.0   \n",
-       "2  155100.0   583910.0  ...  251490.0  1333418.0     0.0     0.0     0.0   \n",
-       "3   99950.0   423990.0  ...  165320.0  1414283.0     0.0     0.0     0.0   \n",
-       "4  126860.0   589490.0  ...  212000.0  3820152.0   420.0   168.0    60.0   \n",
-       "\n",
-       "   A85300   N11901    A11901    N11902     A11902  \n",
-       "0     0.0  61920.0   48150.0  732670.0  1933120.0  \n",
-       "1     0.0  73720.0  107304.0  415410.0  1187403.0  \n",
-       "2     0.0  64200.0  139598.0  193030.0   536699.0  \n",
-       "3     0.0  45460.0  128823.0  116440.0   377177.0  \n",
-       "4    31.0  83330.0  421004.0  121570.0   483682.0  \n",
-       "\n",
-       "[5 rows x 131 columns]"
-      ]
-     },
-     "execution_count": 20,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Now these two datasets can be joined and worked with\n",
    "irs = pd.concat([\n",
    "    pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
    "    pd.read_csv(\"../data/processed/irs_2015_2\")\n",
    "])\n",
-    "irs.head()"
+    "# irs.head()\n",
+    "\n",
+    "\n",
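+    "# Selected IRS columns (used in conjunction with the gaybourhoods data set):\n",
+    "#   zipcode  - zip code, the key shared with the rest of the data\n",
+    "#   N2       - used here as the population of the zip code\n",
+    "# Data of interest:\n",
+    "#   A11900   - total overpayments amount\n",
+    "#              NOTE(review): the wrangling cell below selects A11901, not A11900 - confirm which column is intended\n",
+    "#   agi_stub - metric for income (bracket number, 1-6)\n",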
+    "\n",
+    "# print(irs.loc[irs['zipcode']==90069])\n",
+    "# df = {irs['zipcode'], irs['N2']}\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                 zip    population        income  overall tax paid\n",
+      "count  166698.000000  1.666980e+05  166698.00000      1.666980e+05\n",
+      "mean    48877.636432  3.432536e+03       3.50000      1.844871e+03\n",
+      "std     27146.337114  6.676873e+04       1.70783      5.785610e+04\n",
+      "min         0.000000  0.000000e+00       1.00000      0.000000e+00\n",
+      "25%     27040.000000  1.400000e+02       2.00000      1.600000e+01\n",
+      "50%     48879.000000  5.100000e+02       3.50000      1.440000e+02\n",
+      "75%     70607.000000  2.000000e+03       5.00000      6.310000e+02\n",
+      "max     99999.000000  9.566490e+06       6.00000      1.557123e+07\n",
+      "          zip  population  income  overall tax paid\n",
+      "0           0   1356760.0       1           48150.0\n",
+      "1           0   1010990.0       2          107304.0\n",
+      "2           0    583910.0       3          139598.0\n",
+      "3           0    423990.0       4          128823.0\n",
+      "4           0    589490.0       5          421004.0\n",
+      "...       ...         ...     ...               ...\n",
+      "166693  99999      6660.0       2             869.0\n",
+      "166694  99999      5440.0       3            1273.0\n",
+      "166695  99999      4780.0       4            1635.0\n",
+      "166696  99999      6930.0       5            5576.0\n",
+      "166697  99999      1890.0       6           14487.0\n",
+      "\n",
+      "[166698 rows x 4 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Wrangle tax: keep only the IRS columns used downstream, under readable names.\n",
+    "# Selecting and renaming columns replaces the row-by-row zip() rebuild.\n",
+    "# reset_index(drop=True) yields the same fresh 0..n-1 index the zip() version produced\n",
+    "# (irs comes from pd.concat of two frames, so its own index labels may repeat).\n",
+    "taxdf = (\n",
+    "    irs[['zipcode', 'N2', 'agi_stub', 'A11901']]\n",
+    "    .rename(columns={\n",
+    "        'zipcode': 'zip',\n",
+    "        'N2': 'population',\n",
+    "        'agi_stub': 'income',\n",
+    "        'A11901': 'overall tax paid',\n",
+    "    })\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "print(taxdf.describe())\n",
+    "print(taxdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                zip  gay tax rate\n",
+      "count   2328.000000   2328.000000\n",
+      "mean   48616.478522   4103.440722\n",
+      "std    35481.240641   3140.699446\n",
+      "min     1730.000000      0.000000\n",
+      "25%    11362.750000   1767.500000\n",
+      "50%    46351.000000   3635.000000\n",
+      "75%    80234.250000   5745.000000\n",
+      "max    98686.000000  24560.000000\n",
+      "        zip  gay tax rate\n",
+      "0     90069          2120\n",
+      "1     94114          5080\n",
+      "2     10011          5790\n",
+      "3     10014          3510\n",
+      "4     94103          2660\n",
+      "...     ...           ...\n",
+      "2323  97208             0\n",
+      "2324  98154             0\n",
+      "2325  98158             0\n",
+      "2326  98174             0\n",
+      "2327  98195             0\n",
+      "\n",
+      "[2328 rows x 2 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Wrangle gaybourhoods: keep the zip code key and the Tax_Mjoint column.\n",
+    "# Selecting and renaming columns replaces the row-by-row zip() rebuild;\n",
+    "# reset_index(drop=True) keeps the fresh 0..n-1 index that version produced.\n",
+    "# NOTE(review): Tax_Mjoint is labelled 'gay tax rate', but its values print as\n",
+    "# whole numbers up to 24560 - confirm whether it is a count rather than a rate.\n",
+    "gaydf = (\n",
+    "    gaybourhoods[['GEOID10', 'Tax_Mjoint']]\n",
+    "    .rename(columns={'GEOID10': 'zip', 'Tax_Mjoint': 'gay tax rate'})\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "\n",
+    "print(gaydf.describe())\n",
+    "print(gaydf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                zip     population  gay tax rate  overall tax paid  income\n",
+      "count   2184.000000    2184.000000   2184.000000       2184.000000  2184.0\n",
+      "mean   48935.203297   26691.730769   4373.997253        596.719322     1.0\n",
+      "std    35451.335807   17960.713867   3054.620840        615.174358     0.0\n",
+      "min     1730.000000     160.000000      0.000000          0.000000     1.0\n",
+      "25%    11360.750000   13337.500000   2110.000000        217.000000     1.0\n",
+      "50%    60023.500000   24070.000000   3900.000000        434.000000     1.0\n",
+      "75%    80227.250000   35640.000000   5902.500000        777.250000     1.0\n",
+      "max    98686.000000  114420.000000  24560.000000       9166.000000     1.0\n",
+      "------------------------------------------------------------------------\n",
+      "         zip  population  gay tax rate  overall tax paid  income\n",
+      "zip                                                             \n",
+      "1730    1730     13570.0          3260             150.0       1\n",
+      "1731    1731      2450.0           550               0.0       1\n",
+      "1742    1742     17170.0          4220             297.0       1\n",
+      "1760    1760     34350.0          7880             468.0       1\n",
+      "1770    1770      4310.0          1060              46.0       1\n",
+      "...      ...         ...           ...               ...     ...\n",
+      "98682  98682     57010.0         11080             703.0       1\n",
+      "98683  98683     30700.0          6470             358.0       1\n",
+      "98684  98684     27630.0          5390             371.0       1\n",
+      "98685  98685     27540.0          6490             298.0       1\n",
+      "98686  98686     17800.0          4120             215.0       1\n",
+      "\n",
+      "[2184 rows x 5 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "#merge\n",
+    "df = pd.merge(taxdf, gaydf)\n",
+    "\n",
+    "# print(df)\n",
+    "\n",
+    "df2 = df.groupby(df['zip']).aggregate({ 'zip':'first',\n",
+    "                                        'population': 'sum',\n",
+    "                                        'gay tax rate':'first',\n",
+    "                                        'overall tax paid':'first',\n",
+    "                                        'income':'first'\n",
+    "                                                                })\n",
+    "\n",
+    "print(df2.describe())\n",
+    "print(\"------------------------------------------------------------------------\")\n",
+    "print(df2)"
   ]
  },
  {
@ -489,12 +468,14 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "# TODO: compare the tax figures for the gaybourhoods zip codes with those of the general population"
+   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
@ -508,7 +489,12 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.11.1"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "b2baa059f790e7ad780c83135aaea020c73a7a7a6921010b599b8b664933698d"
+   }
  }
 },
 "nbformat": 4,