Merge pull request #12 from ubco-W2022T2-data301/add-datasets
Add IRS, election and zip code data sets
This commit is contained in:
commit
597759da16
|
@ -30,7 +30,6 @@ Images coming soon.
|
|||
- [Men are from Chelsea, Women are from Park Slope](https://pudding.cool/2018/06/gayborhoods/)
|
||||
- The article for which the data was originally collected.
|
||||
- [The Gaybourhoods data set on Github](https://github.com/the-pudding/data/blob/master/gayborhoods/README.md)
|
||||
|
||||
Sources of (potential) secondary data sets:
|
||||
- [Data set relating US ZIP codes to their coordinates](https://www.kaggle.com/datasets/joeleichter/us-zip-codes-with-lat-and-long)
|
||||
- [Geographic situation of taxes paid in the US (2018)](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2018-zip-code-data-soi)
|
||||
- [Geographic situation of taxes paid in the US (2015)](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2015-zip-code-data-soi)
|
||||
- [County Presidential Election Returns 2000-2020](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ)
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -231,7 +231,7 @@
|
|||
"[5 rows x 29 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -243,6 +243,247 @@
|
|||
"gaybourhoods.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data wrangling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# NOTE: This cell only runs when the raw IRS file exists locally at\n",
|
||||
"# ../data/raw/irs_2015.csv. The file is too big for GitHub to handle, so it has\n",
|
||||
"# to be downloaded from the source linked in the references section of the readme.\n",
|
||||
"irs = pd.read_csv(\"../data/raw/irs_2015.csv\")\n",
|
||||
"\n",
|
||||
"# Naively splitting the IRS data set in two. More formal data wrangling will\n",
|
||||
"# come later. Both halves are cut at the same row position so that no row is\n",
|
||||
"# lost when the number of rows is odd.\n",
|
||||
"split_at = irs.shape[0] // 2\n",
|
||||
"irs1 = irs.iloc[:split_at]\n",
|
||||
"irs2 = irs.iloc[split_at:]\n",
|
||||
"\n",
|
||||
"irs1.to_csv(\"../data/processed/irs_2015_1\", index=False)\n",
|
||||
"irs2.to_csv(\"../data/processed/irs_2015_2\", index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>STATEFIPS</th>\n",
|
||||
" <th>STATE</th>\n",
|
||||
" <th>zipcode</th>\n",
|
||||
" <th>agi_stub</th>\n",
|
||||
" <th>N1</th>\n",
|
||||
" <th>mars1</th>\n",
|
||||
" <th>MARS2</th>\n",
|
||||
" <th>MARS4</th>\n",
|
||||
" <th>PREP</th>\n",
|
||||
" <th>N2</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>N10300</th>\n",
|
||||
" <th>A10300</th>\n",
|
||||
" <th>N85530</th>\n",
|
||||
" <th>A85530</th>\n",
|
||||
" <th>N85300</th>\n",
|
||||
" <th>A85300</th>\n",
|
||||
" <th>N11901</th>\n",
|
||||
" <th>A11901</th>\n",
|
||||
" <th>N11902</th>\n",
|
||||
" <th>A11902</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>836320.0</td>\n",
|
||||
" <td>481570.0</td>\n",
|
||||
" <td>109790.0</td>\n",
|
||||
" <td>233260.0</td>\n",
|
||||
" <td>455560.0</td>\n",
|
||||
" <td>1356760.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>373410.0</td>\n",
|
||||
" <td>328469.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>61920.0</td>\n",
|
||||
" <td>48150.0</td>\n",
|
||||
" <td>732670.0</td>\n",
|
||||
" <td>1933120.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>494830.0</td>\n",
|
||||
" <td>206630.0</td>\n",
|
||||
" <td>146250.0</td>\n",
|
||||
" <td>129390.0</td>\n",
|
||||
" <td>275920.0</td>\n",
|
||||
" <td>1010990.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>395880.0</td>\n",
|
||||
" <td>965011.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>73720.0</td>\n",
|
||||
" <td>107304.0</td>\n",
|
||||
" <td>415410.0</td>\n",
|
||||
" <td>1187403.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>261250.0</td>\n",
|
||||
" <td>80720.0</td>\n",
|
||||
" <td>139280.0</td>\n",
|
||||
" <td>36130.0</td>\n",
|
||||
" <td>155100.0</td>\n",
|
||||
" <td>583910.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>251490.0</td>\n",
|
||||
" <td>1333418.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>64200.0</td>\n",
|
||||
" <td>139598.0</td>\n",
|
||||
" <td>193030.0</td>\n",
|
||||
" <td>536699.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>166690.0</td>\n",
|
||||
" <td>28510.0</td>\n",
|
||||
" <td>124650.0</td>\n",
|
||||
" <td>10630.0</td>\n",
|
||||
" <td>99950.0</td>\n",
|
||||
" <td>423990.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>165320.0</td>\n",
|
||||
" <td>1414283.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>45460.0</td>\n",
|
||||
" <td>128823.0</td>\n",
|
||||
" <td>116440.0</td>\n",
|
||||
" <td>377177.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>AL</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>212660.0</td>\n",
|
||||
" <td>19520.0</td>\n",
|
||||
" <td>184320.0</td>\n",
|
||||
" <td>4830.0</td>\n",
|
||||
" <td>126860.0</td>\n",
|
||||
" <td>589490.0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>212000.0</td>\n",
|
||||
" <td>3820152.0</td>\n",
|
||||
" <td>420.0</td>\n",
|
||||
" <td>168.0</td>\n",
|
||||
" <td>60.0</td>\n",
|
||||
" <td>31.0</td>\n",
|
||||
" <td>83330.0</td>\n",
|
||||
" <td>421004.0</td>\n",
|
||||
" <td>121570.0</td>\n",
|
||||
" <td>483682.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 131 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" STATEFIPS STATE zipcode agi_stub N1 mars1 MARS2 MARS4 \\\n",
|
||||
"0 1 AL 0 1 836320.0 481570.0 109790.0 233260.0 \n",
|
||||
"1 1 AL 0 2 494830.0 206630.0 146250.0 129390.0 \n",
|
||||
"2 1 AL 0 3 261250.0 80720.0 139280.0 36130.0 \n",
|
||||
"3 1 AL 0 4 166690.0 28510.0 124650.0 10630.0 \n",
|
||||
"4 1 AL 0 5 212660.0 19520.0 184320.0 4830.0 \n",
|
||||
"\n",
|
||||
" PREP N2 ... N10300 A10300 N85530 A85530 N85300 \\\n",
|
||||
"0 455560.0 1356760.0 ... 373410.0 328469.0 0.0 0.0 0.0 \n",
|
||||
"1 275920.0 1010990.0 ... 395880.0 965011.0 0.0 0.0 0.0 \n",
|
||||
"2 155100.0 583910.0 ... 251490.0 1333418.0 0.0 0.0 0.0 \n",
|
||||
"3 99950.0 423990.0 ... 165320.0 1414283.0 0.0 0.0 0.0 \n",
|
||||
"4 126860.0 589490.0 ... 212000.0 3820152.0 420.0 168.0 60.0 \n",
|
||||
"\n",
|
||||
" A85300 N11901 A11901 N11902 A11902 \n",
|
||||
"0 0.0 61920.0 48150.0 732670.0 1933120.0 \n",
|
||||
"1 0.0 73720.0 107304.0 415410.0 1187403.0 \n",
|
||||
"2 0.0 64200.0 139598.0 193030.0 536699.0 \n",
|
||||
"3 0.0 45460.0 128823.0 116440.0 377177.0 \n",
|
||||
"4 31.0 83330.0 421004.0 121570.0 483682.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 131 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Now these two datasets can be joined and worked with. Each half is read back\n",
|
||||
"# with its own 0-based index, so ignore_index=True is used to give the combined\n",
|
||||
"# frame one unique, continuous index instead of repeated labels.\n",
|
||||
"irs = pd.concat([\n",
|
||||
"    pd.read_csv(\"../data/processed/irs_2015_1\"),\n",
|
||||
"    pd.read_csv(\"../data/processed/irs_2015_2\")\n",
|
||||
"], ignore_index=True)\n",
|
||||
"irs.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue