From 8de27d9c5eac4575e309a748dffef1a37c416eac Mon Sep 17 00:00:00 2001 From: almsam <44277999+almsam@users.noreply.github.com> Date: Fri, 3 Mar 2023 00:51:14 -0800 Subject: [PATCH] analysis 2 wrangled tentative; anal 2 ready for plotting & visualization --- analysis/analysis2.ipynb | 406 +++++++++++++++++++-------------------- 1 file changed, 196 insertions(+), 210 deletions(-) diff --git a/analysis/analysis2.ipynb b/analysis/analysis2.ipynb index bfe6c10..f7007fb 100644 --- a/analysis/analysis2.ipynb +++ b/analysis/analysis2.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -231,13 +231,15 @@ "[5 rows x 29 columns]" ] }, - "execution_count": 16, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", + "import seaborn as sns\n", + "import numpy as np\n", "\n", "gaybourhoods = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n", "gaybourhoods.head(5)" @@ -252,13 +254,33 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 44, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../data/raw/irs_2015.csv'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[44], line 5\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39m# NOTE: This cell will not work unless this file is in the repository. The source\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[39m# can be found linked in the references section of the readme, however, it is too\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[39m# big for GitHub to handle.\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m irs \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m\"\u001b[39;49m\u001b[39m../data/raw/irs_2015.csv\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[0;32m 7\u001b[0m \u001b[39m# Naively splitting the IRS data set in two. More formal data wrangling will\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[39m# come later\u001b[39;00m\n\u001b[0;32m 9\u001b[0m irs1 \u001b[39m=\u001b[39m irs\u001b[39m.\u001b[39mhead(\u001b[39mint\u001b[39m(irs\u001b[39m.\u001b[39mshape[\u001b[39m0\u001b[39m] \u001b[39m/\u001b[39m \u001b[39m2\u001b[39m))\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.._deprecate_kwarg..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 210\u001b[0m kwargs[new_arg_name] \u001b[39m=\u001b[39m new_arg_value\n\u001b[1;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\util\\_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[0;32m 326\u001b[0m warnings\u001b[39m.\u001b[39mwarn(\n\u001b[0;32m 327\u001b[0m msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m 328\u001b[0m \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[0;32m 329\u001b[0m stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[0;32m 330\u001b[0m )\n\u001b[1;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[0;32m 935\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[0;32m 936\u001b[0m dialect,\n\u001b[0;32m 937\u001b[0m delimiter,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 946\u001b[0m defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[0;32m 947\u001b[0m )\n\u001b[0;32m 948\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[1;32m--> 950\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 602\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 604\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[1;32m--> 605\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[0;32m 607\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[0;32m 608\u001b[0m \u001b[39mreturn\u001b[39;00m parser\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 1439\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m 1441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m-> 1442\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\parsers\\readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[1;34m(self, f, engine)\u001b[0m\n\u001b[0;32m 1733\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[0;32m 1734\u001b[0m mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m-> 1735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[0;32m 1736\u001b[0m f,\n\u001b[0;32m 1737\u001b[0m mode,\n\u001b[0;32m 1738\u001b[0m encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1739\u001b[0m compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1740\u001b[0m memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[0;32m 1741\u001b[0m is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[0;32m 1742\u001b[0m errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[0;32m 1743\u001b[0m storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[0;32m 1744\u001b[0m )\n\u001b[0;32m 1745\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m 1746\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n", + "File \u001b[1;32mc:\\Users\\samia\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas\\io\\common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m 851\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[0;32m 852\u001b[0m \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[0;32m 853\u001b[0m \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[0;32m 854\u001b[0m \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[0;32m 855\u001b[0m \u001b[39m# Encoding\u001b[39;00m\n\u001b[1;32m--> 856\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(\n\u001b[0;32m 857\u001b[0m handle,\n\u001b[0;32m 858\u001b[0m ioargs\u001b[39m.\u001b[39mmode,\n\u001b[0;32m 859\u001b[0m encoding\u001b[39m=\u001b[39mioargs\u001b[39m.\u001b[39mencoding,\n\u001b[0;32m 860\u001b[0m errors\u001b[39m=\u001b[39merrors,\n\u001b[0;32m 861\u001b[0m newline\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 862\u001b[0m )\n\u001b[0;32m 863\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 864\u001b[0m \u001b[39m# Binary mode\u001b[39;00m\n\u001b[0;32m 865\u001b[0m handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n", + "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../data/raw/irs_2015.csv'" + ] + } + ], "source": [ "# NOTE: This cell will not work unless this file is in the repository. The source\n", "# can be found linked in the references section of the readme, however, it is too\n", "# big for GitHub to handle.\n", + "\n", "irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n", "\n", "# Naively splitting the IRS data set in two. More formal data wrangling will\n", @@ -272,216 +294,173 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 45, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
STATEFIPSSTATEzipcodeagi_stubN1mars1MARS2MARS4PREPN2...N10300A10300N85530A85530N85300A85300N11901A11901N11902A11902
01AL01836320.0481570.0109790.0233260.0455560.01356760.0...373410.0328469.00.00.00.00.061920.048150.0732670.01933120.0
11AL02494830.0206630.0146250.0129390.0275920.01010990.0...395880.0965011.00.00.00.00.073720.0107304.0415410.01187403.0
21AL03261250.080720.0139280.036130.0155100.0583910.0...251490.01333418.00.00.00.00.064200.0139598.0193030.0536699.0
31AL04166690.028510.0124650.010630.099950.0423990.0...165320.01414283.00.00.00.00.045460.0128823.0116440.0377177.0
41AL05212660.019520.0184320.04830.0126860.0589490.0...212000.03820152.0420.0168.060.031.083330.0421004.0121570.0483682.0
\n", - "

5 rows × 131 columns

\n", - "
" - ], - "text/plain": [ - " STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n", - "0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n", - "1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n", - "2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n", - "3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n", - "4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n", - "\n", - " PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n", - "0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n", - "1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n", - "2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n", - "3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n", - "4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n", - "\n", - " A85300 N11901 A11901 N11902 A11902 \n", - "0 0.0 61920.0 48150.0 732670.0 1933120.0 \n", - "1 0.0 73720.0 107304.0 415410.0 1187403.0 \n", - "2 0.0 64200.0 139598.0 193030.0 536699.0 \n", - "3 0.0 45460.0 128823.0 116440.0 377177.0 \n", - "4 31.0 83330.0 421004.0 121570.0 483682.0 \n", - "\n", - "[5 rows x 131 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Now these two datasets can be joined and worked with\n", "irs = pd.concat([\n", " pd.read_csv(\"../data/processed/irs_2015_1\"),\n", " pd.read_csv(\"../data/processed/irs_2015_2\")\n", "])\n", - "irs.head()" + "# irs.head()\n", + "\n", + "\n", + "#selected data: ZIPCODE - this will be used in conjunction with the rest of the set\n", + " # N2 - population of zip code\n", + " \n", + " #data of intrest\n", + " # A11900\tTotal overpayments amount\n", + " # AGI_STUB - metric for income\n", + "\n", + "# print(irs.loc[irs['zipcode']==90069])\n", + "# df = {irs['zipcode'], irs['N2']}\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " zip population income overall tax paid\n", + "count 166698.000000 1.666980e+05 166698.00000 1.666980e+05\n", + "mean 48877.636432 3.432536e+03 3.50000 1.844871e+03\n", + "std 27146.337114 6.676873e+04 1.70783 5.785610e+04\n", + "min 0.000000 0.000000e+00 1.00000 0.000000e+00\n", + "25% 27040.000000 1.400000e+02 2.00000 1.600000e+01\n", + "50% 48879.000000 5.100000e+02 3.50000 1.440000e+02\n", + "75% 70607.000000 2.000000e+03 5.00000 6.310000e+02\n", + "max 99999.000000 9.566490e+06 6.00000 1.557123e+07\n", + " zip population income overall tax paid\n", + "0 0 1356760.0 1 48150.0\n", + "1 0 1010990.0 2 107304.0\n", + "2 0 583910.0 3 139598.0\n", + "3 0 423990.0 4 128823.0\n", + "4 0 589490.0 5 421004.0\n", + "... ... ... ... ...\n", + "166693 99999 6660.0 2 869.0\n", + "166694 99999 5440.0 3 1273.0\n", + "166695 99999 4780.0 4 1635.0\n", + "166696 99999 6930.0 5 5576.0\n", + "166697 99999 1890.0 6 14487.0\n", + "\n", + "[166698 rows x 4 columns]\n" + ] + } + ], + "source": [ + "#wrangle tax\n", + "taxdf = pd.DataFrame(zip(irs['zipcode'], irs['N2'], irs['agi_stub'], irs['A11901']))\n", + "taxdf.columns=('zip', 'population', 'income', 'overall tax paid')\n", + "print(taxdf.describe())\n", + "print(taxdf)\n", + "# print(irs.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " zip gay tax rate\n", + "count 2328.000000 2328.000000\n", + "mean 48616.478522 4103.440722\n", + "std 35481.240641 3140.699446\n", + "min 1730.000000 0.000000\n", + "25% 11362.750000 1767.500000\n", + "50% 46351.000000 3635.000000\n", + "75% 80234.250000 5745.000000\n", + "max 98686.000000 24560.000000\n", + " zip gay tax rate\n", + "0 90069 2120\n", + "1 94114 5080\n", + "2 10011 5790\n", + "3 10014 3510\n", + "4 94103 2660\n", + "... ... ...\n", + "2323 97208 0\n", + "2324 98154 0\n", + "2325 98158 0\n", + "2326 98174 0\n", + "2327 98195 0\n", + "\n", + "[2328 rows x 2 columns]\n" + ] + } + ], + "source": [ + "#wrangle gay\n", + "gaydf = pd.DataFrame(zip(gaybourhoods['GEOID10'], gaybourhoods['Tax_Mjoint']))\n", + "gaydf.columns=(('zip', 'gay tax rate'))\n", + "\n", + "print(gaydf.describe())\n", + "print(gaydf)\n", + "\n", + "# gaybourhoods.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " zip population gay tax rate overall tax paid income\n", + "count 2184.000000 2184.000000 2184.000000 2184.000000 2184.0\n", + "mean 48935.203297 26691.730769 4373.997253 596.719322 1.0\n", + "std 35451.335807 17960.713867 3054.620840 615.174358 0.0\n", + "min 1730.000000 160.000000 0.000000 0.000000 1.0\n", + "25% 11360.750000 13337.500000 2110.000000 217.000000 1.0\n", + "50% 60023.500000 24070.000000 3900.000000 434.000000 1.0\n", + "75% 80227.250000 35640.000000 5902.500000 777.250000 1.0\n", + "max 98686.000000 114420.000000 24560.000000 9166.000000 1.0\n", + "------------------------------------------------------------------------\n", + " zip population gay tax rate overall tax paid income\n", + "zip \n", + "1730 1730 13570.0 3260 150.0 1\n", + "1731 1731 2450.0 550 0.0 1\n", + "1742 1742 17170.0 4220 297.0 1\n", + "1760 1760 34350.0 7880 468.0 1\n", + "1770 1770 4310.0 1060 46.0 1\n", + "... ... ... ... ... ...\n", + "98682 98682 57010.0 11080 703.0 1\n", + "98683 98683 30700.0 6470 358.0 1\n", + "98684 98684 27630.0 5390 371.0 1\n", + "98685 98685 27540.0 6490 298.0 1\n", + "98686 98686 17800.0 4120 215.0 1\n", + "\n", + "[2184 rows x 5 columns]\n" + ] + } + ], + "source": [ + "#merge\n", + "df = pd.merge(taxdf, gaydf)\n", + "\n", + "# print(df)\n", + "\n", + "df2 = df.groupby(df['zip']).aggregate({ 'zip':'first',\n", + " 'population': 'sum',\n", + " 'gay tax rate':'first',\n", + " 'overall tax paid':'first',\n", + " 'income':'first'\n", + " })\n", + "\n", + "print(df2.describe())\n", + "print(\"------------------------------------------------------------------------\")\n", + "print(df2)" ] }, { @@ -489,12 +468,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#compare taxes paid by queers to taxes paid by general" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -508,7 +489,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.1" + }, + "vscode": { + "interpreter": { + "hash": "b2baa059f790e7ad780c83135aaea020c73a7a7a6921010b599b8b664933698d" + } } }, "nbformat": 4,