Save data sets to the raw/processed folders

As discussed in #6, the IRS data set is too big for GitHub to handle.
So, some processing has been added to analysis/analisys2.ipynb to split
the data set into two parts, which are stored in the processed folder.
This commit is contained in:
Nat 2023-03-01 16:10:26 -08:00
parent 3aa5db8d24
commit e242fed907
Signed by: nat
GPG Key ID: B53AB05285D710D6
5 changed files with 272706 additions and 2 deletions

View File

@ -26,7 +26,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -231,7 +231,7 @@
"[5 rows x 29 columns]" "[5 rows x 29 columns]"
] ]
}, },
"execution_count": 1, "execution_count": 16,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -243,6 +243,247 @@
"gaybourhoods.head(5)" "gaybourhoods.head(5)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data wrangling"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: This cell will not work unless the raw IRS file is present locally. The\n",
"# source can be found linked in the references section of the readme, however, it\n",
"# is too big for GitHub to handle.\n",
"irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
"\n",
"# Naively splitting the IRS data set in two. More formal data wrangling will\n",
"# come later. Slice at a single midpoint so every row lands in exactly one half:\n",
"# head(n / 2) + tail(n / 2) would silently drop the middle row whenever the row\n",
"# count is odd.\n",
"midpoint = len(irs) // 2\n",
"irs1 = irs.iloc[:midpoint]\n",
"irs2 = irs.iloc[midpoint:]\n",
"assert len(irs1) + len(irs2) == len(irs), \"split must not lose or duplicate rows\"\n",
"\n",
"# index=False keeps the pandas row index out of the files so they can be read\n",
"# back and concatenated without an extra unnamed column.\n",
"irs1.to_csv(\"../data/processed/irs_2015_1\", index=False)\n",
"irs2.to_csv(\"../data/processed/irs_2015_2\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFIPS</th>\n",
" <th>STATE</th>\n",
" <th>zipcode</th>\n",
" <th>agi_stub</th>\n",
" <th>N1</th>\n",
" <th>mars1</th>\n",
" <th>MARS2</th>\n",
" <th>MARS4</th>\n",
" <th>PREP</th>\n",
" <th>N2</th>\n",
" <th>...</th>\n",
" <th>N10300</th>\n",
" <th>A10300</th>\n",
" <th>N85530</th>\n",
" <th>A85530</th>\n",
" <th>N85300</th>\n",
" <th>A85300</th>\n",
" <th>N11901</th>\n",
" <th>A11901</th>\n",
" <th>N11902</th>\n",
" <th>A11902</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>836320.0</td>\n",
" <td>481570.0</td>\n",
" <td>109790.0</td>\n",
" <td>233260.0</td>\n",
" <td>455560.0</td>\n",
" <td>1356760.0</td>\n",
" <td>...</td>\n",
" <td>373410.0</td>\n",
" <td>328469.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>61920.0</td>\n",
" <td>48150.0</td>\n",
" <td>732670.0</td>\n",
" <td>1933120.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>494830.0</td>\n",
" <td>206630.0</td>\n",
" <td>146250.0</td>\n",
" <td>129390.0</td>\n",
" <td>275920.0</td>\n",
" <td>1010990.0</td>\n",
" <td>...</td>\n",
" <td>395880.0</td>\n",
" <td>965011.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>73720.0</td>\n",
" <td>107304.0</td>\n",
" <td>415410.0</td>\n",
" <td>1187403.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>261250.0</td>\n",
" <td>80720.0</td>\n",
" <td>139280.0</td>\n",
" <td>36130.0</td>\n",
" <td>155100.0</td>\n",
" <td>583910.0</td>\n",
" <td>...</td>\n",
" <td>251490.0</td>\n",
" <td>1333418.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>64200.0</td>\n",
" <td>139598.0</td>\n",
" <td>193030.0</td>\n",
" <td>536699.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>166690.0</td>\n",
" <td>28510.0</td>\n",
" <td>124650.0</td>\n",
" <td>10630.0</td>\n",
" <td>99950.0</td>\n",
" <td>423990.0</td>\n",
" <td>...</td>\n",
" <td>165320.0</td>\n",
" <td>1414283.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>45460.0</td>\n",
" <td>128823.0</td>\n",
" <td>116440.0</td>\n",
" <td>377177.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>212660.0</td>\n",
" <td>19520.0</td>\n",
" <td>184320.0</td>\n",
" <td>4830.0</td>\n",
" <td>126860.0</td>\n",
" <td>589490.0</td>\n",
" <td>...</td>\n",
" <td>212000.0</td>\n",
" <td>3820152.0</td>\n",
" <td>420.0</td>\n",
" <td>168.0</td>\n",
" <td>60.0</td>\n",
" <td>31.0</td>\n",
" <td>83330.0</td>\n",
" <td>421004.0</td>\n",
" <td>121570.0</td>\n",
" <td>483682.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 131 columns</p>\n",
"</div>"
],
"text/plain": [
" STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n",
"0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n",
"1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n",
"2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n",
"3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n",
"4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n",
"\n",
" PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n",
"0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n",
"1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n",
"2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n",
"3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n",
"4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n",
"\n",
" A85300 N11901 A11901 N11902 A11902 \n",
"0 0.0 61920.0 48150.0 732670.0 1933120.0 \n",
"1 0.0 73720.0 107304.0 415410.0 1187403.0 \n",
"2 0.0 64200.0 139598.0 193030.0 536699.0 \n",
"3 0.0 45460.0 128823.0 116440.0 377177.0 \n",
"4 31.0 83330.0 421004.0 121570.0 483682.0 \n",
"\n",
"[5 rows x 131 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now these two datasets can be joined and worked with. ignore_index=True gives\n",
"# the combined frame a fresh 0..n-1 index; without it each half keeps the 0-based\n",
"# index assigned by read_csv, so every label would appear twice and label-based\n",
"# lookups such as irs.loc[0] would return two rows.\n",
"irs = pd.concat(\n",
"    [\n",
"        pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
"        pd.read_csv(\"../data/processed/irs_2015_2\"),\n",
"    ],\n",
"    ignore_index=True,\n",
")\n",
"irs.head()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

83350
data/processed/irs_2015_1 Normal file

File diff suppressed because it is too large Load Diff

83350
data/processed/irs_2015_2 Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

33145
data/raw/zip_lat_long.csv Normal file

File diff suppressed because it is too large Load Diff