analysis 2 wrangled
Tentative: analysis 2 is ready for plotting & visualization
This commit is contained in:
parent
f0fd88fb08
commit
8de27d9c5e
|
@ -26,7 +26,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -231,13 +231,15 @@
|
|||
"[5 rows x 29 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"gaybourhoods = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
|
||||
"gaybourhoods.head(5)"
|
||||
|
@ -252,13 +254,33 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "FileNotFoundError",
|
||||
"evalue": "[Errno 2] No such file or directory: '../data/raw/irs_2015.csv'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[44], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39m# NOTE: This cell will not work unless this file is in the repository. The source\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[39m# can be found linked in the references section of the readme, however, it is too\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[39m# big for GitHub to handle.\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m irs \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39m../data/raw/irs_2015.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m 7\u001b[0m \u001b[39m# Naively splitting the IRS data set in two. More formal data wrangling will\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[39m# come later\u001b[39;00m\n\u001b[0;32m 9\u001b[0m irs1 \u001b[39m=\u001b[39m irs\u001b[39m.\u001b[39mhead(\u001b[39mint\u001b[39m(irs\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m] \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m))\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 210\u001b[0m kwargs[new_arg_name] \u001b[39m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[39m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 935\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 936\u001b[0m dialect,\n\u001b[0;32m 937\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 946\u001b[0m defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 947\u001b[0m )\n\u001b[0;32m 948\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 602\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 604\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[0;32m 607\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m 608\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1439\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m 1441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1733\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m 1734\u001b[0m mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m 1736\u001b[0m f,\n\u001b[0;32m 1737\u001b[0m mode,\n\u001b[0;32m 1738\u001b[0m encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1739\u001b[0m compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1740\u001b[0m memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m 1741\u001b[0m is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m 1742\u001b[0m errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m 1743\u001b[0m 
storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1744\u001b[0m )\n\u001b[0;32m 1745\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 1746\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
|
||||
"File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 851\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m 852\u001b[0m \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 853\u001b[0m \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 854\u001b[0m \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m 855\u001b[0m \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(\n\u001b[0;32m 857\u001b[0m handle,\n\u001b[0;32m 858\u001b[0m ioargs\u001b[39m.\u001b[39mmode,\n\u001b[0;32m 859\u001b[0m encoding\u001b[39m=\u001b[39mioargs\u001b[39m.\u001b[39mencoding,\n\u001b[0;32m 860\u001b[0m errors\u001b[39m=\u001b[39merrors,\n\u001b[0;32m 861\u001b[0m newline\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 862\u001b[0m )\n\u001b[0;32m 863\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 864\u001b[0m \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m 865\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
|
||||
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/raw/irs_2015.csv'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# NOTE: This cell will not work unless irs_2015.csv is in ../data/raw/. The source\n",
|
||||
"# can be found linked in the references section of the readme, however, it is too\n",
|
||||
"# big for GitHub to handle.\n",
|
||||
"\n",
|
||||
"irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
|
||||
"\n",
|
||||
"# Naively splitting the IRS data set in two. More formal data wrangling will\n",
|
||||
|
@ -272,216 +294,173 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>STATEFIPS</th>\n",
|
||||
" <th>STATE</th>\n",
|
||||
" <th>zipcode</th>\n",
|
||||
" <th>agi_stub</th>\n",
|
||||
" <th>N1</th>\n",
|
||||
" <th>mars1</th>\n",
|
||||
" <th>MARS2</th>\n",
|
||||
" <th>MARS4</th>\n",
|
||||
" <th>PREP</th>\n",
|
||||
" <th>N2</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>N10300</th>\n",
|
||||
" <th>A10300</th>\n",
|
||||
" <th>N85530</th>\n",
|
||||
" <th>A85530</th>\n",
|
||||
" <th>N85300</th>\n",
|
||||
" <th>A85300</th>\n",
|
||||
" <th>N11901</th>\n",
|
||||
" <th>A11901</th>\n",
|
||||
" <th>N11902</th>\n",
|
||||
" <th>A11902</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>836320.0</td>\n",
|
||||
" <td>481570.0</td>\n",
|
||||
" <td>109790.0</td>\n",
|
||||
" <td>233260.0</td>\n",
|
||||
" <td>455560.0</td>\n",
|
||||
" <td>1356760.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>373410.0</td>\n",
|
||||
" <td>328469.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>61920.0</td>\n",
|
||||
" <td>48150.0</td>\n",
|
||||
" <td>732670.0</td>\n",
|
||||
" <td>1933120.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>494830.0</td>\n",
|
||||
" <td>206630.0</td>\n",
|
||||
" <td>146250.0</td>\n",
|
||||
" <td>129390.0</td>\n",
|
||||
" <td>275920.0</td>\n",
|
||||
" <td>1010990.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>395880.0</td>\n",
|
||||
" <td>965011.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>73720.0</td>\n",
|
||||
" <td>107304.0</td>\n",
|
||||
" <td>415410.0</td>\n",
|
||||
" <td>1187403.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>261250.0</td>\n",
|
||||
" <td>80720.0</td>\n",
|
||||
" <td>139280.0</td>\n",
|
||||
" <td>36130.0</td>\n",
|
||||
" <td>155100.0</td>\n",
|
||||
" <td>583910.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>251490.0</td>\n",
|
||||
" <td>1333418.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>64200.0</td>\n",
|
||||
" <td>139598.0</td>\n",
|
||||
" <td>193030.0</td>\n",
|
||||
" <td>536699.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>166690.0</td>\n",
|
||||
" <td>28510.0</td>\n",
|
||||
" <td>124650.0</td>\n",
|
||||
" <td>10630.0</td>\n",
|
||||
" <td>99950.0</td>\n",
|
||||
" <td>423990.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>165320.0</td>\n",
|
||||
" <td>1414283.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>45460.0</td>\n",
|
||||
" <td>128823.0</td>\n",
|
||||
" <td>116440.0</td>\n",
|
||||
" <td>377177.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>212660.0</td>\n",
|
||||
" <td>19520.0</td>\n",
|
||||
" <td>184320.0</td>\n",
|
||||
" <td>4830.0</td>\n",
|
||||
" <td>126860.0</td>\n",
|
||||
" <td>589490.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>212000.0</td>\n",
|
||||
" <td>3820152.0</td>\n",
|
||||
" <td>420.0</td>\n",
|
||||
" <td>168.0</td>\n",
|
||||
" <td>60.0</td>\n",
|
||||
" <td>31.0</td>\n",
|
||||
" <td>83330.0</td>\n",
|
||||
" <td>421004.0</td>\n",
|
||||
" <td>121570.0</td>\n",
|
||||
" <td>483682.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 131 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n",
|
||||
"0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n",
|
||||
"1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n",
|
||||
"2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n",
|
||||
"3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n",
|
||||
"4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n",
|
||||
"\n",
|
||||
" PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n",
|
||||
"0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n",
|
||||
"1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n",
|
||||
"2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n",
|
||||
"3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n",
|
||||
"4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n",
|
||||
"\n",
|
||||
" A85300 N11901 A11901 N11902 A11902 \n",
|
||||
"0 0.0 61920.0 48150.0 732670.0 1933120.0 \n",
|
||||
"1 0.0 73720.0 107304.0 415410.0 1187403.0 \n",
|
||||
"2 0.0 64200.0 139598.0 193030.0 536699.0 \n",
|
||||
"3 0.0 45460.0 128823.0 116440.0 377177.0 \n",
|
||||
"4 31.0 83330.0 421004.0 121570.0 483682.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 131 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now these two datasets can be joined and worked with\n",
|
||||
"irs = pd.concat([\n",
|
||||
" pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
|
||||
" pd.read_csv(\"../data/processed/irs_2015_2\")\n",
|
||||
"])\n",
|
||||
"irs.head()"
|
||||
"# irs.head()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#selected data: ZIPCODE - this will be used in conjunction with the rest of the set\n",
|
||||
" # N2 - population of zip code\n",
|
||||
" \n",
|
||||
"            #data of interest\n",
|
||||
" # A11900\tTotal overpayments amount\n",
|
||||
" # AGI_STUB - metric for income\n",
|
||||
"\n",
|
||||
"# print(irs.loc[irs['zipcode']==90069])\n",
|
||||
"# df = {irs['zipcode'], irs['N2']}\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" zip population income overall tax paid\n",
|
||||
"count 166698.000000 1.666980e+05 166698.00000 1.666980e+05\n",
|
||||
"mean 48877.636432 3.432536e+03 3.50000 1.844871e+03\n",
|
||||
"std 27146.337114 6.676873e+04 1.70783 5.785610e+04\n",
|
||||
"min 0.000000 0.000000e+00 1.00000 0.000000e+00\n",
|
||||
"25% 27040.000000 1.400000e+02 2.00000 1.600000e+01\n",
|
||||
"50% 48879.000000 5.100000e+02 3.50000 1.440000e+02\n",
|
||||
"75% 70607.000000 2.000000e+03 5.00000 6.310000e+02\n",
|
||||
"max 99999.000000 9.566490e+06 6.00000 1.557123e+07\n",
|
||||
" zip population income overall tax paid\n",
|
||||
"0 0 1356760.0 1 48150.0\n",
|
||||
"1 0 1010990.0 2 107304.0\n",
|
||||
"2 0 583910.0 3 139598.0\n",
|
||||
"3 0 423990.0 4 128823.0\n",
|
||||
"4 0 589490.0 5 421004.0\n",
|
||||
"... ... ... ... ...\n",
|
||||
"166693 99999 6660.0 2 869.0\n",
|
||||
"166694 99999 5440.0 3 1273.0\n",
|
||||
"166695 99999 4780.0 4 1635.0\n",
|
||||
"166696 99999 6930.0 5 5576.0\n",
|
||||
"166697 99999 1890.0 6 14487.0\n",
|
||||
"\n",
|
||||
"[166698 rows x 4 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#wrangle tax\n",
|
||||
"taxdf = pd.DataFrame(zip(irs['zipcode'], irs['N2'], irs['agi_stub'], irs['A11901']))\n",
|
||||
"taxdf.columns=('zip', 'population', 'income', 'overall tax paid')\n",
|
||||
"print(taxdf.describe())\n",
|
||||
"print(taxdf)\n",
|
||||
"# print(irs.columns)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" zip gay tax rate\n",
|
||||
"count 2328.000000 2328.000000\n",
|
||||
"mean 48616.478522 4103.440722\n",
|
||||
"std 35481.240641 3140.699446\n",
|
||||
"min 1730.000000 0.000000\n",
|
||||
"25% 11362.750000 1767.500000\n",
|
||||
"50% 46351.000000 3635.000000\n",
|
||||
"75% 80234.250000 5745.000000\n",
|
||||
"max 98686.000000 24560.000000\n",
|
||||
" zip gay tax rate\n",
|
||||
"0 90069 2120\n",
|
||||
"1 94114 5080\n",
|
||||
"2 10011 5790\n",
|
||||
"3 10014 3510\n",
|
||||
"4 94103 2660\n",
|
||||
"... ... ...\n",
|
||||
"2323 97208 0\n",
|
||||
"2324 98154 0\n",
|
||||
"2325 98158 0\n",
|
||||
"2326 98174 0\n",
|
||||
"2327 98195 0\n",
|
||||
"\n",
|
||||
"[2328 rows x 2 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#wrangle gay\n",
|
||||
"gaydf = pd.DataFrame(zip(gaybourhoods['GEOID10'], gaybourhoods['Tax_Mjoint']))\n",
|
||||
"gaydf.columns=(('zip', 'gay tax rate'))\n",
|
||||
"\n",
|
||||
"print(gaydf.describe())\n",
|
||||
"print(gaydf)\n",
|
||||
"\n",
|
||||
"# gaybourhoods.columns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 91,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" zip population gay tax rate overall tax paid income\n",
|
||||
"count 2184.000000 2184.000000 2184.000000 2184.000000 2184.0\n",
|
||||
"mean 48935.203297 26691.730769 4373.997253 596.719322 1.0\n",
|
||||
"std 35451.335807 17960.713867 3054.620840 615.174358 0.0\n",
|
||||
"min 1730.000000 160.000000 0.000000 0.000000 1.0\n",
|
||||
"25% 11360.750000 13337.500000 2110.000000 217.000000 1.0\n",
|
||||
"50% 60023.500000 24070.000000 3900.000000 434.000000 1.0\n",
|
||||
"75% 80227.250000 35640.000000 5902.500000 777.250000 1.0\n",
|
||||
"max 98686.000000 114420.000000 24560.000000 9166.000000 1.0\n",
|
||||
"------------------------------------------------------------------------\n",
|
||||
" zip population gay tax rate overall tax paid income\n",
|
||||
"zip \n",
|
||||
"1730 1730 13570.0 3260 150.0 1\n",
|
||||
"1731 1731 2450.0 550 0.0 1\n",
|
||||
"1742 1742 17170.0 4220 297.0 1\n",
|
||||
"1760 1760 34350.0 7880 468.0 1\n",
|
||||
"1770 1770 4310.0 1060 46.0 1\n",
|
||||
"... ... ... ... ... ...\n",
|
||||
"98682 98682 57010.0 11080 703.0 1\n",
|
||||
"98683 98683 30700.0 6470 358.0 1\n",
|
||||
"98684 98684 27630.0 5390 371.0 1\n",
|
||||
"98685 98685 27540.0 6490 298.0 1\n",
|
||||
"98686 98686 17800.0 4120 215.0 1\n",
|
||||
"\n",
|
||||
"[2184 rows x 5 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#merge\n",
|
||||
"df = pd.merge(taxdf, gaydf)\n",
|
||||
"\n",
|
||||
"# print(df)\n",
|
||||
"\n",
|
||||
"df2 = df.groupby(df['zip']).aggregate({ 'zip':'first',\n",
|
||||
" 'population': 'sum',\n",
|
||||
" 'gay tax rate':'first',\n",
|
||||
" 'overall tax paid':'first',\n",
|
||||
" 'income':'first'\n",
|
||||
" })\n",
|
||||
"\n",
|
||||
"print(df2.describe())\n",
|
||||
"print(\"------------------------------------------------------------------------\")\n",
|
||||
"print(df2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -489,12 +468,14 @@
|
|||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"#compare taxes paid by queers to taxes paid by the general population"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -508,7 +489,12 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.11.1"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "b2baa059f790e7ad780c83135aaea020c73a7a7a6921010b599b8b664933698d"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
Loading…
Reference in New Issue