analysis 2 wrangled

tentative; analysis 2 ready for plotting & visualization
This commit is contained in:
almsam 2023-03-03 00:51:14 -08:00
parent f0fd88fb08
commit 8de27d9c5e
1 changed files with 196 additions and 210 deletions

View File

@ -26,7 +26,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 43,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -231,13 +231,15 @@
"[5 rows x 29 columns]" "[5 rows x 29 columns]"
] ]
}, },
"execution_count": 16, "execution_count": 43,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n", "\n",
"gaybourhoods = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n", "gaybourhoods = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"gaybourhoods.head(5)" "gaybourhoods.head(5)"
@ -252,13 +254,33 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 44,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: '../data/raw/irs_2015.csv'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[44], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39m# NOTE: This cell will not work unless this file is in the repository. The source\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[39m# can be found linked in the references section of the readme, however, it is too\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[39m# big for GitHub to handle.\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m irs \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39m../data/raw/irs_2015.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m 7\u001b[0m \u001b[39m# Naively splitting the IRS data set in two. More formal data wrangling will\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[39m# come later\u001b[39;00m\n\u001b[0;32m 9\u001b[0m irs1 \u001b[39m=\u001b[39m irs\u001b[39m.\u001b[39mhead(\u001b[39mint\u001b[39m(irs\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m] \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m))\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 210\u001b[0m kwargs[new_arg_name] \u001b[39m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[39m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 935\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 936\u001b[0m dialect,\n\u001b[0;32m 937\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 946\u001b[0m defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 947\u001b[0m )\n\u001b[0;32m 948\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 602\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 604\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[0;32m 607\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m 608\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1439\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m 1441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1733\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m 1734\u001b[0m mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m 1736\u001b[0m f,\n\u001b[0;32m 1737\u001b[0m mode,\n\u001b[0;32m 1738\u001b[0m encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1739\u001b[0m compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1740\u001b[0m memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m 1741\u001b[0m is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m 1742\u001b[0m errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m 1743\u001b[0m 
storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1744\u001b[0m )\n\u001b[0;32m 1745\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 1746\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 851\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m 852\u001b[0m \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 853\u001b[0m \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 854\u001b[0m \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m 855\u001b[0m \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(\n\u001b[0;32m 857\u001b[0m handle,\n\u001b[0;32m 858\u001b[0m ioargs\u001b[39m.\u001b[39mmode,\n\u001b[0;32m 859\u001b[0m encoding\u001b[39m=\u001b[39mioargs\u001b[39m.\u001b[39mencoding,\n\u001b[0;32m 860\u001b[0m errors\u001b[39m=\u001b[39merrors,\n\u001b[0;32m 861\u001b[0m newline\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 862\u001b[0m )\n\u001b[0;32m 863\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 864\u001b[0m \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m 865\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/raw/irs_2015.csv'"
]
}
],
"source": [ "source": [
"# NOTE: This cell will not work unless this file is in the repository. The source\n", "# NOTE: This cell will not work unless this file is in the repository. The source\n",
"# can be found linked in the references section of the readme, however, it is too\n", "# can be found linked in the references section of the readme, however, it is too\n",
"# big for GitHub to handle.\n", "# big for GitHub to handle.\n",
"\n",
"irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n", "irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
"\n", "\n",
"# Naively splitting the IRS data set in two. More formal data wrangling will\n", "# Naively splitting the IRS data set in two. More formal data wrangling will\n",
@ -272,216 +294,173 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 45,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFIPS</th>\n",
" <th>STATE</th>\n",
" <th>zipcode</th>\n",
" <th>agi_stub</th>\n",
" <th>N1</th>\n",
" <th>mars1</th>\n",
" <th>MARS2</th>\n",
" <th>MARS4</th>\n",
" <th>PREP</th>\n",
" <th>N2</th>\n",
" <th>...</th>\n",
" <th>N10300</th>\n",
" <th>A10300</th>\n",
" <th>N85530</th>\n",
" <th>A85530</th>\n",
" <th>N85300</th>\n",
" <th>A85300</th>\n",
" <th>N11901</th>\n",
" <th>A11901</th>\n",
" <th>N11902</th>\n",
" <th>A11902</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>836320.0</td>\n",
" <td>481570.0</td>\n",
" <td>109790.0</td>\n",
" <td>233260.0</td>\n",
" <td>455560.0</td>\n",
" <td>1356760.0</td>\n",
" <td>...</td>\n",
" <td>373410.0</td>\n",
" <td>328469.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>61920.0</td>\n",
" <td>48150.0</td>\n",
" <td>732670.0</td>\n",
" <td>1933120.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>494830.0</td>\n",
" <td>206630.0</td>\n",
" <td>146250.0</td>\n",
" <td>129390.0</td>\n",
" <td>275920.0</td>\n",
" <td>1010990.0</td>\n",
" <td>...</td>\n",
" <td>395880.0</td>\n",
" <td>965011.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>73720.0</td>\n",
" <td>107304.0</td>\n",
" <td>415410.0</td>\n",
" <td>1187403.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>261250.0</td>\n",
" <td>80720.0</td>\n",
" <td>139280.0</td>\n",
" <td>36130.0</td>\n",
" <td>155100.0</td>\n",
" <td>583910.0</td>\n",
" <td>...</td>\n",
" <td>251490.0</td>\n",
" <td>1333418.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>64200.0</td>\n",
" <td>139598.0</td>\n",
" <td>193030.0</td>\n",
" <td>536699.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>166690.0</td>\n",
" <td>28510.0</td>\n",
" <td>124650.0</td>\n",
" <td>10630.0</td>\n",
" <td>99950.0</td>\n",
" <td>423990.0</td>\n",
" <td>...</td>\n",
" <td>165320.0</td>\n",
" <td>1414283.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>45460.0</td>\n",
" <td>128823.0</td>\n",
" <td>116440.0</td>\n",
" <td>377177.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>212660.0</td>\n",
" <td>19520.0</td>\n",
" <td>184320.0</td>\n",
" <td>4830.0</td>\n",
" <td>126860.0</td>\n",
" <td>589490.0</td>\n",
" <td>...</td>\n",
" <td>212000.0</td>\n",
" <td>3820152.0</td>\n",
" <td>420.0</td>\n",
" <td>168.0</td>\n",
" <td>60.0</td>\n",
" <td>31.0</td>\n",
" <td>83330.0</td>\n",
" <td>421004.0</td>\n",
" <td>121570.0</td>\n",
" <td>483682.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 131 columns</p>\n",
"</div>"
],
"text/plain": [
" STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n",
"0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n",
"1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n",
"2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n",
"3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n",
"4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n",
"\n",
" PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n",
"0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n",
"1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n",
"2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n",
"3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n",
"4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n",
"\n",
" A85300 N11901 A11901 N11902 A11902 \n",
"0 0.0 61920.0 48150.0 732670.0 1933120.0 \n",
"1 0.0 73720.0 107304.0 415410.0 1187403.0 \n",
"2 0.0 64200.0 139598.0 193030.0 536699.0 \n",
"3 0.0 45460.0 128823.0 116440.0 377177.0 \n",
"4 31.0 83330.0 421004.0 121570.0 483682.0 \n",
"\n",
"[5 rows x 131 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"# Now these two datasets can be joined and worked with\n", "# Now these two datasets can be joined and worked with\n",
"irs = pd.concat([\n", "irs = pd.concat([\n",
" pd.read_csv(\"../data/processed/irs_2015_1\"),\n", " pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
" pd.read_csv(\"../data/processed/irs_2015_2\")\n", " pd.read_csv(\"../data/processed/irs_2015_2\")\n",
"])\n", "])\n",
"irs.head()" "# irs.head()\n",
"\n",
"\n",
"# Selected data: zipcode - this will be used in conjunction with the rest of the data set\n",
" # N2 - population of zip code\n",
" \n",
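" # Data of interest\n",
" # A11900\tTotal overpayments amount\n",
" # NOTE(review): the tax-wrangling cell below reads A11901, not A11900 - confirm which column is intended\n",
" # agi_stub - metric for income\n",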
"\n",
"# print(irs.loc[irs['zipcode']==90069])\n",
"# df = {irs['zipcode'], irs['N2']}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" zip population income overall tax paid\n",
"count 166698.000000 1.666980e+05 166698.00000 1.666980e+05\n",
"mean 48877.636432 3.432536e+03 3.50000 1.844871e+03\n",
"std 27146.337114 6.676873e+04 1.70783 5.785610e+04\n",
"min 0.000000 0.000000e+00 1.00000 0.000000e+00\n",
"25% 27040.000000 1.400000e+02 2.00000 1.600000e+01\n",
"50% 48879.000000 5.100000e+02 3.50000 1.440000e+02\n",
"75% 70607.000000 2.000000e+03 5.00000 6.310000e+02\n",
"max 99999.000000 9.566490e+06 6.00000 1.557123e+07\n",
" zip population income overall tax paid\n",
"0 0 1356760.0 1 48150.0\n",
"1 0 1010990.0 2 107304.0\n",
"2 0 583910.0 3 139598.0\n",
"3 0 423990.0 4 128823.0\n",
"4 0 589490.0 5 421004.0\n",
"... ... ... ... ...\n",
"166693 99999 6660.0 2 869.0\n",
"166694 99999 5440.0 3 1273.0\n",
"166695 99999 4780.0 4 1635.0\n",
"166696 99999 6930.0 5 5576.0\n",
"166697 99999 1890.0 6 14487.0\n",
"\n",
"[166698 rows x 4 columns]\n"
]
}
],
"source": [
"# Wrangle tax data: keep the zipcode, N2, agi_stub and A11901 columns of the IRS set under readable names\n",
"taxdf = pd.DataFrame(zip(irs['zipcode'], irs['N2'], irs['agi_stub'], irs['A11901']))\n",
"taxdf.columns=('zip', 'population', 'income', 'overall tax paid')\n",
"print(taxdf.describe())\n",
"print(taxdf)\n",
"# print(irs.columns)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" zip gay tax rate\n",
"count 2328.000000 2328.000000\n",
"mean 48616.478522 4103.440722\n",
"std 35481.240641 3140.699446\n",
"min 1730.000000 0.000000\n",
"25% 11362.750000 1767.500000\n",
"50% 46351.000000 3635.000000\n",
"75% 80234.250000 5745.000000\n",
"max 98686.000000 24560.000000\n",
" zip gay tax rate\n",
"0 90069 2120\n",
"1 94114 5080\n",
"2 10011 5790\n",
"3 10014 3510\n",
"4 94103 2660\n",
"... ... ...\n",
"2323 97208 0\n",
"2324 98154 0\n",
"2325 98158 0\n",
"2326 98174 0\n",
"2327 98195 0\n",
"\n",
"[2328 rows x 2 columns]\n"
]
}
],
"source": [
"# Wrangle gaybourhoods data: keep the GEOID10 (renamed 'zip') and Tax_Mjoint columns\n",
"gaydf = pd.DataFrame(zip(gaybourhoods['GEOID10'], gaybourhoods['Tax_Mjoint']))\n",
"gaydf.columns=(('zip', 'gay tax rate'))\n",
"\n",
"print(gaydf.describe())\n",
"print(gaydf)\n",
"\n",
"# gaybourhoods.columns"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" zip population gay tax rate overall tax paid income\n",
"count 2184.000000 2184.000000 2184.000000 2184.000000 2184.0\n",
"mean 48935.203297 26691.730769 4373.997253 596.719322 1.0\n",
"std 35451.335807 17960.713867 3054.620840 615.174358 0.0\n",
"min 1730.000000 160.000000 0.000000 0.000000 1.0\n",
"25% 11360.750000 13337.500000 2110.000000 217.000000 1.0\n",
"50% 60023.500000 24070.000000 3900.000000 434.000000 1.0\n",
"75% 80227.250000 35640.000000 5902.500000 777.250000 1.0\n",
"max 98686.000000 114420.000000 24560.000000 9166.000000 1.0\n",
"------------------------------------------------------------------------\n",
" zip population gay tax rate overall tax paid income\n",
"zip \n",
"1730 1730 13570.0 3260 150.0 1\n",
"1731 1731 2450.0 550 0.0 1\n",
"1742 1742 17170.0 4220 297.0 1\n",
"1760 1760 34350.0 7880 468.0 1\n",
"1770 1770 4310.0 1060 46.0 1\n",
"... ... ... ... ... ...\n",
"98682 98682 57010.0 11080 703.0 1\n",
"98683 98683 30700.0 6470 358.0 1\n",
"98684 98684 27630.0 5390 371.0 1\n",
"98685 98685 27540.0 6490 298.0 1\n",
"98686 98686 17800.0 4120 215.0 1\n",
"\n",
"[2184 rows x 5 columns]\n"
]
}
],
"source": [
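"# Merge the tax and gaybourhoods frames on their shared 'zip' column, then collapse to one row per zip code\n",
"# NOTE(review): only 'population' is summed; 'overall tax paid' and 'income' keep the first row of each zip group - confirm this is intended\n",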
"df = pd.merge(taxdf, gaydf)\n",
"\n",
"# print(df)\n",
"\n",
"df2 = df.groupby(df['zip']).aggregate({ 'zip':'first',\n",
" 'population': 'sum',\n",
" 'gay tax rate':'first',\n",
" 'overall tax paid':'first',\n",
" 'income':'first'\n",
" })\n",
"\n",
"print(df2.describe())\n",
"print(\"------------------------------------------------------------------------\")\n",
"print(df2)"
] ]
}, },
{ {
@ -489,12 +468,14 @@
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": [
"# Compare taxes paid by queers to taxes paid by the general population"
]
} }
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3 (ipykernel)", "display_name": "Python 3",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -508,7 +489,12 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.9" "version": "3.11.1"
},
"vscode": {
"interpreter": {
"hash": "b2baa059f790e7ad780c83135aaea020c73a7a7a6921010b599b8b664933698d"
}
} }
}, },
"nbformat": 4, "nbformat": 4,