Merge pull request #12 from ubco-W2022T2-data301/add-datasets

Add IRS, election and zip code data sets
commit 597759da16
Author: Nat
Date: 2023-03-02 17:27:52 -08:00 (committed by GitHub)
GPG signature: no known key found for this signature in database (Key ID: 4AEE18F83AFDEB23)
6 changed files with 272708 additions and 5 deletions


@@ -30,7 +30,6 @@ Images coming soon.
 - [Men are from Chelsea, Women are from Park Slope](https://pudding.cool/2018/06/gayborhoods/)
   - The article for which the data was originally collected.
 - [The Gaybourhoods data set on GitHub](https://github.com/the-pudding/data/blob/master/gayborhoods/README.md)
-Sources of (potential) secondary data sets:
 - [Data set relating US ZIP codes to their coordinates](https://www.kaggle.com/datasets/joeleichter/us-zip-codes-with-lat-and-long)
-- [Geographic situation of taxes paid in the US](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2018-zip-code-data-soi)
+- [Geographic situation of taxes paid in the US](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2015-zip-code-data-soi)
+- [County Presidential Election Returns 2000-2020](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ)
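
Both of the ZIP-keyed secondary sources can eventually be merged with each other (and with the Gaybourhoods data). A minimal sketch of such a merge, assuming pandas and this PR's repository layout; the column names for the Kaggle coordinate file (ZIP, LAT, LNG) are assumptions, while the IRS zipcode column is visible in the notebook output below:

```python
import pandas as pd

# Rebuild the IRS data set from the two halves committed in this PR.
irs = pd.concat([
    pd.read_csv("../data/processed/irs_2015_1"),
    pd.read_csv("../data/processed/irs_2015_2"),
])

# Column names here are an assumption about the Kaggle ZIP/lat/long file.
coords = pd.read_csv("../data/raw/zip_lat_long.csv")

# Left-join coordinates onto the tax records by ZIP code; rows with
# zipcode 0 (state-level totals in the SOI data) simply get no match.
geo_irs = irs.merge(coords, left_on="zipcode", right_on="ZIP", how="left")
```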


@@ -26,7 +26,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": 16,
 "metadata": {},
 "outputs": [
 {
@@ -231,7 +231,7 @@
 "[5 rows x 29 columns]"
 ]
 },
-"execution_count": 1,
+"execution_count": 16,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -243,6 +243,247 @@
 "gaybourhoods.head(5)"
 ]
 },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data wrangling"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: This cell will not work unless this file is in the repository. The source\n",
"# can be found linked in the references section of the readme, however, it is too\n",
"# big for GitHub to handle.\n",
"irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
"\n",
"# Naively splitting the IRS data set in two. More formal data wrangling will\n",
"# come later\n",
"irs1 = irs.head(int(irs.shape[0] / 2))\n",
"irs2 = irs.tail(int(irs.shape[0] / 2))\n",
"\n",
"irs1.to_csv(\"../data/processed/irs_2015_1\", index=False)\n",
"irs2.to_csv(\"../data/processed/irs_2015_2\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>STATEFIPS</th>\n",
" <th>STATE</th>\n",
" <th>zipcode</th>\n",
" <th>agi_stub</th>\n",
" <th>N1</th>\n",
" <th>mars1</th>\n",
" <th>MARS2</th>\n",
" <th>MARS4</th>\n",
" <th>PREP</th>\n",
" <th>N2</th>\n",
" <th>...</th>\n",
" <th>N10300</th>\n",
" <th>A10300</th>\n",
" <th>N85530</th>\n",
" <th>A85530</th>\n",
" <th>N85300</th>\n",
" <th>A85300</th>\n",
" <th>N11901</th>\n",
" <th>A11901</th>\n",
" <th>N11902</th>\n",
" <th>A11902</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>836320.0</td>\n",
" <td>481570.0</td>\n",
" <td>109790.0</td>\n",
" <td>233260.0</td>\n",
" <td>455560.0</td>\n",
" <td>1356760.0</td>\n",
" <td>...</td>\n",
" <td>373410.0</td>\n",
" <td>328469.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>61920.0</td>\n",
" <td>48150.0</td>\n",
" <td>732670.0</td>\n",
" <td>1933120.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>494830.0</td>\n",
" <td>206630.0</td>\n",
" <td>146250.0</td>\n",
" <td>129390.0</td>\n",
" <td>275920.0</td>\n",
" <td>1010990.0</td>\n",
" <td>...</td>\n",
" <td>395880.0</td>\n",
" <td>965011.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>73720.0</td>\n",
" <td>107304.0</td>\n",
" <td>415410.0</td>\n",
" <td>1187403.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>261250.0</td>\n",
" <td>80720.0</td>\n",
" <td>139280.0</td>\n",
" <td>36130.0</td>\n",
" <td>155100.0</td>\n",
" <td>583910.0</td>\n",
" <td>...</td>\n",
" <td>251490.0</td>\n",
" <td>1333418.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>64200.0</td>\n",
" <td>139598.0</td>\n",
" <td>193030.0</td>\n",
" <td>536699.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>166690.0</td>\n",
" <td>28510.0</td>\n",
" <td>124650.0</td>\n",
" <td>10630.0</td>\n",
" <td>99950.0</td>\n",
" <td>423990.0</td>\n",
" <td>...</td>\n",
" <td>165320.0</td>\n",
" <td>1414283.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>45460.0</td>\n",
" <td>128823.0</td>\n",
" <td>116440.0</td>\n",
" <td>377177.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>AL</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>212660.0</td>\n",
" <td>19520.0</td>\n",
" <td>184320.0</td>\n",
" <td>4830.0</td>\n",
" <td>126860.0</td>\n",
" <td>589490.0</td>\n",
" <td>...</td>\n",
" <td>212000.0</td>\n",
" <td>3820152.0</td>\n",
" <td>420.0</td>\n",
" <td>168.0</td>\n",
" <td>60.0</td>\n",
" <td>31.0</td>\n",
" <td>83330.0</td>\n",
" <td>421004.0</td>\n",
" <td>121570.0</td>\n",
" <td>483682.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 131 columns</p>\n",
"</div>"
],
"text/plain": [
" STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n",
"0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n",
"1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n",
"2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n",
"3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n",
"4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n",
"\n",
" PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n",
"0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n",
"1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n",
"2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n",
"3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n",
"4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n",
"\n",
" A85300 N11901 A11901 N11902 A11902 \n",
"0 0.0 61920.0 48150.0 732670.0 1933120.0 \n",
"1 0.0 73720.0 107304.0 415410.0 1187403.0 \n",
"2 0.0 64200.0 139598.0 193030.0 536699.0 \n",
"3 0.0 45460.0 128823.0 116440.0 377177.0 \n",
"4 31.0 83330.0 421004.0 121570.0 483682.0 \n",
"\n",
"[5 rows x 131 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now these two datasets can be joined and worked with\n",
"irs = pd.concat([\n",
" pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
" pd.read_csv(\"../data/processed/irs_2015_2\")\n",
"])\n",
"irs.head()"
]
},
 {
 "cell_type": "code",
 "execution_count": null,

data/processed/irs_2015_1 (new file, 83350 lines)
File diff suppressed because it is too large

data/processed/irs_2015_2 (new file, 83350 lines)
File diff suppressed because it is too large

File diff suppressed because it is too large

data/raw/zip_lat_long.csv (new file, 33145 lines)
File diff suppressed because it is too large