Save data sets to the raw/processed folders
As discussed in #6, the IRS data set is too big for GitHub to handle. So, some processing has been added to analysis/analisys2.ipynb to split the data set into two parts, which are stored in the processed folder.
This commit is contained in:
parent
3aa5db8d24
commit
e242fed907
|
@ -26,7 +26,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -231,7 +231,7 @@
|
|||
"[5 rows x 29 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -243,6 +243,247 @@
|
|||
"gaybourhoods.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data wrangling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# NOTE: This cell will not work unless the raw IRS file is present locally. The\n",
|
||||
"# source is linked in the references section of the readme; the file itself is\n",
|
||||
"# too big for GitHub to handle.\n",
|
||||
"irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
|
||||
"\n",
|
||||
"# Naively split the IRS data set in two. More formal data wrangling will come\n",
|
||||
"# later. Slicing at a single midpoint keeps every row: head(n / 2) plus\n",
|
||||
"# tail(n / 2) would silently drop the middle row when the row count is odd.\n",
|
||||
"midpoint = len(irs) // 2\n",
|
||||
"irs1 = irs.iloc[:midpoint]\n",
|
||||
"irs2 = irs.iloc[midpoint:]\n",
|
||||
"\n",
|
||||
"# index=False: the row index is not written to the CSV files.\n",
|
||||
"irs1.to_csv(\"../data/processed/irs_2015_1\", index=False)\n",
|
||||
"irs2.to_csv(\"../data/processed/irs_2015_2\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>STATEFIPS</th>\n",
|
||||
" <th>STATE</th>\n",
|
||||
" <th>zipcode</th>\n",
|
||||
" <th>agi_stub</th>\n",
|
||||
" <th>N1</th>\n",
|
||||
" <th>mars1</th>\n",
|
||||
" <th>MARS2</th>\n",
|
||||
" <th>MARS4</th>\n",
|
||||
" <th>PREP</th>\n",
|
||||
" <th>N2</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>N10300</th>\n",
|
||||
" <th>A10300</th>\n",
|
||||
" <th>N85530</th>\n",
|
||||
" <th>A85530</th>\n",
|
||||
" <th>N85300</th>\n",
|
||||
" <th>A85300</th>\n",
|
||||
" <th>N11901</th>\n",
|
||||
" <th>A11901</th>\n",
|
||||
" <th>N11902</th>\n",
|
||||
" <th>A11902</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>836320.0</td>\n",
|
||||
" <td>481570.0</td>\n",
|
||||
" <td>109790.0</td>\n",
|
||||
" <td>233260.0</td>\n",
|
||||
" <td>455560.0</td>\n",
|
||||
" <td>1356760.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>373410.0</td>\n",
|
||||
" <td>328469.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>61920.0</td>\n",
|
||||
" <td>48150.0</td>\n",
|
||||
" <td>732670.0</td>\n",
|
||||
" <td>1933120.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>494830.0</td>\n",
|
||||
" <td>206630.0</td>\n",
|
||||
" <td>146250.0</td>\n",
|
||||
" <td>129390.0</td>\n",
|
||||
" <td>275920.0</td>\n",
|
||||
" <td>1010990.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>395880.0</td>\n",
|
||||
" <td>965011.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>73720.0</td>\n",
|
||||
" <td>107304.0</td>\n",
|
||||
" <td>415410.0</td>\n",
|
||||
" <td>1187403.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>261250.0</td>\n",
|
||||
" <td>80720.0</td>\n",
|
||||
" <td>139280.0</td>\n",
|
||||
" <td>36130.0</td>\n",
|
||||
" <td>155100.0</td>\n",
|
||||
" <td>583910.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>251490.0</td>\n",
|
||||
" <td>1333418.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>64200.0</td>\n",
|
||||
" <td>139598.0</td>\n",
|
||||
" <td>193030.0</td>\n",
|
||||
" <td>536699.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>166690.0</td>\n",
|
||||
" <td>28510.0</td>\n",
|
||||
" <td>124650.0</td>\n",
|
||||
" <td>10630.0</td>\n",
|
||||
" <td>99950.0</td>\n",
|
||||
" <td>423990.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>165320.0</td>\n",
|
||||
" <td>1414283.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>45460.0</td>\n",
|
||||
" <td>128823.0</td>\n",
|
||||
" <td>116440.0</td>\n",
|
||||
" <td>377177.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>212660.0</td>\n",
|
||||
" <td>19520.0</td>\n",
|
||||
" <td>184320.0</td>\n",
|
||||
" <td>4830.0</td>\n",
|
||||
" <td>126860.0</td>\n",
|
||||
" <td>589490.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>212000.0</td>\n",
|
||||
" <td>3820152.0</td>\n",
|
||||
" <td>420.0</td>\n",
|
||||
" <td>168.0</td>\n",
|
||||
" <td>60.0</td>\n",
|
||||
" <td>31.0</td>\n",
|
||||
" <td>83330.0</td>\n",
|
||||
" <td>421004.0</td>\n",
|
||||
" <td>121570.0</td>\n",
|
||||
" <td>483682.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 131 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n",
|
||||
"0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n",
|
||||
"1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n",
|
||||
"2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n",
|
||||
"3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n",
|
||||
"4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n",
|
||||
"\n",
|
||||
" PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n",
|
||||
"0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n",
|
||||
"1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n",
|
||||
"2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n",
|
||||
"3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n",
|
||||
"4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n",
|
||||
"\n",
|
||||
" A85300 N11901 A11901 N11902 A11902 \n",
|
||||
"0 0.0 61920.0 48150.0 732670.0 1933120.0 \n",
|
||||
"1 0.0 73720.0 107304.0 415410.0 1187403.0 \n",
|
||||
"2 0.0 64200.0 139598.0 193030.0 536699.0 \n",
|
||||
"3 0.0 45460.0 128823.0 116440.0 377177.0 \n",
|
||||
"4 31.0 83330.0 421004.0 121570.0 483682.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 131 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Now these two data sets can be joined and worked with. ignore_index=True\n",
|
||||
"# rebuilds one continuous 0..n-1 index; without it each part keeps its own\n",
|
||||
"# index starting at 0 and the combined frame carries duplicate index labels.\n",
|
||||
"irs = pd.concat(\n",
|
||||
"    [\n",
|
||||
"        pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
|
||||
"        pd.read_csv(\"../data/processed/irs_2015_2\"),\n",
|
||||
"    ],\n",
|
||||
"    ignore_index=True,\n",
|
||||
")\n",
|
||||
"irs.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue