Perform preliminary data wrangling for #9

This commit is contained in:
Nat 2023-03-01 20:25:34 -08:00
parent 6ee345d8d3
commit 4f086a23b9
Signed by: nat
GPG Key ID: B53AB05285D710D6
4 changed files with 14613 additions and 164 deletions

View File

@ -29,7 +29,24 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Wrangling"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -53,205 +70,421 @@
" <thead>\n", " <thead>\n",
" <tr style=\"text-align: right;\">\n", " <tr style=\"text-align: right;\">\n",
" <th></th>\n", " <th></th>\n",
" <th>GEOID10</th>\n", " <th>name</th>\n",
" <th>Tax_Mjoint</th>\n", " <th>lat</th>\n",
" <th>Mjoint_MF</th>\n", " <th>long</th>\n",
" <th>Mjoint_SS</th>\n",
" <th>Mjoint_FF</th>\n",
" <th>Mjoint_MM</th>\n",
" <th>TaxRate_SS</th>\n",
" <th>TaxRate_FF</th>\n",
" <th>TaxRate_MM</th>\n",
" <th>Cns_TotHH</th>\n",
" <th>...</th>\n",
" <th>FF_Cns</th>\n",
" <th>FF_Index</th>\n",
" <th>MM_Tax</th>\n",
" <th>MM_Cns</th>\n",
" <th>MM_Index</th>\n",
" <th>SS_Index</th>\n",
" <th>SS_Index_Weight</th>\n",
" <th>Parade_Weight</th>\n",
" <th>Bars_Weight</th>\n",
" <th>TOTINDEX</th>\n",
" </tr>\n", " </tr>\n",
" </thead>\n", " </thead>\n",
" <tbody>\n", " <tbody>\n",
" <tr>\n", " <tr>\n",
" <th>0</th>\n", " <th>0</th>\n",
" <td>90069</td>\n", " <td>Hancock OH</td>\n",
" <td>2120</td>\n", " <td>41.000471</td>\n",
" <td>1689</td>\n", " <td>-83.666033</td>\n",
" <td>431</td>\n",
" <td>61</td>\n",
" <td>370</td>\n",
" <td>203.301887</td>\n",
" <td>28.773585</td>\n",
" <td>174.528302</td>\n",
" <td>12551</td>\n",
" <td>...</td>\n",
" <td>1.847099</td>\n",
" <td>6.724415</td>\n",
" <td>29.583721</td>\n",
" <td>18.704533</td>\n",
" <td>48.288254</td>\n",
" <td>55.012669</td>\n",
" <td>39.429995</td>\n",
" <td>10</td>\n",
" <td>17.647059</td>\n",
" <td>67.077054</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>1</th>\n", " <th>1</th>\n",
" <td>94114</td>\n", " <td>Stafford VA</td>\n",
" <td>5080</td>\n", " <td>38.413261</td>\n",
" <td>4036</td>\n", " <td>-77.451334</td>\n",
" <td>1044</td>\n",
" <td>170</td>\n",
" <td>874</td>\n",
" <td>205.511811</td>\n",
" <td>33.464567</td>\n",
" <td>172.047244</td>\n",
" <td>16456</td>\n",
" <td>...</td>\n",
" <td>4.161579</td>\n",
" <td>9.834048</td>\n",
" <td>29.163165</td>\n",
" <td>19.415304</td>\n",
" <td>48.578469</td>\n",
" <td>58.412517</td>\n",
" <td>41.866815</td>\n",
" <td>0</td>\n",
" <td>20.000000</td>\n",
" <td>61.866815</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>2</th>\n", " <th>2</th>\n",
" <td>10011</td>\n", " <td>Webster NE</td>\n",
" <td>5790</td>\n", " <td>40.180646</td>\n",
" <td>5166</td>\n", " <td>-98.498590</td>\n",
" <td>624</td>\n",
" <td>97</td>\n",
" <td>527</td>\n",
" <td>107.772021</td>\n",
" <td>16.753022</td>\n",
" <td>91.018998</td>\n",
" <td>29762</td>\n",
" <td>...</td>\n",
" <td>1.531029</td>\n",
" <td>4.370779</td>\n",
" <td>15.428332</td>\n",
" <td>10.932081</td>\n",
" <td>26.360413</td>\n",
" <td>30.731192</td>\n",
" <td>22.026394</td>\n",
" <td>10</td>\n",
" <td>5.882353</td>\n",
" <td>37.908747</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>3</th>\n", " <th>3</th>\n",
" <td>10014</td>\n", " <td>Dimmit TX</td>\n",
" <td>3510</td>\n", " <td>28.423587</td>\n",
" <td>3229</td>\n", " <td>-99.765871</td>\n",
" <td>281</td>\n",
" <td>74</td>\n",
" <td>207</td>\n",
" <td>80.056980</td>\n",
" <td>21.082621</td>\n",
" <td>58.974359</td>\n",
" <td>18786</td>\n",
" <td>...</td>\n",
" <td>2.482293</td>\n",
" <td>6.055939</td>\n",
" <td>9.996551</td>\n",
" <td>5.943318</td>\n",
" <td>15.939869</td>\n",
" <td>21.995808</td>\n",
" <td>15.765361</td>\n",
" <td>10</td>\n",
" <td>11.764706</td>\n",
" <td>37.530067</td>\n",
" </tr>\n", " </tr>\n",
" <tr>\n", " <tr>\n",
" <th>4</th>\n", " <th>4</th>\n",
" <td>94103</td>\n", " <td>Cedar IA</td>\n",
" <td>2660</td>\n", " <td>41.772360</td>\n",
" <td>2417</td>\n", " <td>-91.132610</td>\n",
" <td>243</td>\n",
" <td>34</td>\n",
" <td>209</td>\n",
" <td>91.353383</td>\n",
" <td>12.781955</td>\n",
" <td>78.571429</td>\n",
" <td>12728</td>\n",
" <td>...</td>\n",
" <td>0.837431</td>\n",
" <td>3.004058</td>\n",
" <td>13.318386</td>\n",
" <td>4.961779</td>\n",
" <td>18.280165</td>\n",
" <td>21.284224</td>\n",
" <td>15.255337</td>\n",
" <td>10</td>\n",
" <td>10.588235</td>\n",
" <td>35.843573</td>\n",
" </tr>\n", " </tr>\n",
" </tbody>\n", " </tbody>\n",
"</table>\n", "</table>\n",
"<p>5 rows × 29 columns</p>\n",
"</div>" "</div>"
], ],
"text/plain": [ "text/plain": [
" GEOID10 Tax_Mjoint Mjoint_MF Mjoint_SS Mjoint_FF Mjoint_MM \\\n", " name lat long\n",
"0 90069 2120 1689 431 61 370 \n", "0 Hancock OH 41.000471 -83.666033\n",
"1 94114 5080 4036 1044 170 874 \n", "1 Stafford VA 38.413261 -77.451334\n",
"2 10011 5790 5166 624 97 527 \n", "2 Webster NE 40.180646 -98.498590\n",
"3 10014 3510 3229 281 74 207 \n", "3 Dimmit TX 28.423587 -99.765871\n",
"4 94103 2660 2417 243 34 209 \n", "4 Cedar IA 41.772360 -91.132610"
"\n",
" TaxRate_SS TaxRate_FF TaxRate_MM Cns_TotHH ... FF_Cns FF_Index \\\n",
"0 203.301887 28.773585 174.528302 12551 ... 1.847099 6.724415 \n",
"1 205.511811 33.464567 172.047244 16456 ... 4.161579 9.834048 \n",
"2 107.772021 16.753022 91.018998 29762 ... 1.531029 4.370779 \n",
"3 80.056980 21.082621 58.974359 18786 ... 2.482293 6.055939 \n",
"4 91.353383 12.781955 78.571429 12728 ... 0.837431 3.004058 \n",
"\n",
" MM_Tax MM_Cns MM_Index SS_Index SS_Index_Weight Parade_Weight \\\n",
"0 29.583721 18.704533 48.288254 55.012669 39.429995 10 \n",
"1 29.163165 19.415304 48.578469 58.412517 41.866815 0 \n",
"2 15.428332 10.932081 26.360413 30.731192 22.026394 10 \n",
"3 9.996551 5.943318 15.939869 21.995808 15.765361 10 \n",
"4 13.318386 4.961779 18.280165 21.284224 15.255337 10 \n",
"\n",
" Bars_Weight TOTINDEX \n",
"0 17.647059 67.077054 \n",
"1 20.000000 61.866815 \n",
"2 5.882353 37.908747 \n",
"3 11.764706 37.530067 \n",
"4 10.588235 35.843573 \n",
"\n",
"[5 rows x 29 columns]"
] ]
}, },
"execution_count": 2, "execution_count": 76,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"import pandas as pd\n", "## counties - Relating US counties to their long/lat position on the Earth\n",
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n",
"\n", "\n",
"gaybourhoods = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n", "counties = counties.rename({\n",
"gaybourhoods.head(5)" " \"NAME\": \"name\",\n",
" \"INTPTLAT\": \"lat\",\n",
" \"INTPTLON\": \"long\",\n",
"}, axis=\"columns\")\n",
"\n",
"# Combine the county name with the state code (e.g. \"Hancock\" + \"OH\" becomes \"Hancock OH\").\n",
"# Vectorised string concatenation does this for the whole column at once, so no\n",
"# row-wise apply() helper is needed. STUSAB is dropped in the same chain because\n",
"# it is only used to build the combined name.\n",
"counties = (\n",
"    counties\n",
"    .assign(name=counties[\"name\"] + \" \" + counties[\"STUSAB\"])\n",
"    .drop(columns=[\"STUSAB\"])\n",
")\n",
"\n",
"# index=False keeps the positional row index out of the file (same convention as the\n",
"# election-2012 export), so re-reading the CSV does not produce an \"Unnamed: 0\" column.\n",
"counties.to_csv(\"../data/processed/us-county-boundaries.csv\", index=False)\n",
"counties.head()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 81,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
"source": [] {
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>county</th>\n",
" <th>party</th>\n",
" <th>votes</th>\n",
" <th>total</th>\n",
" <th>percent</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Autauga AL</td>\n",
" <td>Democrat</td>\n",
" <td>6363</td>\n",
" <td>23932</td>\n",
" <td>0.265878</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Autauga AL</td>\n",
" <td>Republican</td>\n",
" <td>17379</td>\n",
" <td>23932</td>\n",
" <td>0.726183</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Autauga AL</td>\n",
" <td>Other</td>\n",
" <td>190</td>\n",
" <td>23932</td>\n",
" <td>0.007939</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Baldwin AL</td>\n",
" <td>Democrat</td>\n",
" <td>18424</td>\n",
" <td>85338</td>\n",
" <td>0.215894</td>\n",
" <td>30.659218</td>\n",
" <td>-87.746067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Baldwin AL</td>\n",
" <td>Republican</td>\n",
" <td>66016</td>\n",
" <td>85338</td>\n",
" <td>0.773583</td>\n",
" <td>30.659218</td>\n",
" <td>-87.746067</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" county party votes total percent lat long\n",
"0 Autauga AL Democrat 6363 23932 0.265878 32.532237 -86.646439\n",
"1 Autauga AL Republican 17379 23932 0.726183 32.532237 -86.646439\n",
"2 Autauga AL Other 190 23932 0.007939 32.532237 -86.646439\n",
"3 Baldwin AL Democrat 18424 85338 0.215894 30.659218 -87.746067\n",
"4 Baldwin AL Republican 66016 85338 0.773583 30.659218 -87.746067"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## pol - Election results from the 2012 American presidential election\n",
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
"\n",
"# We only want 2012--the latest election before the gb data was collected.\n",
"# drop=True discards the old row labels instead of adding an \"index\" column\n",
"# that would have to be dropped again below.\n",
"pol = pol[pol[\"year\"] == 2012].reset_index(drop=True)\n",
"\n",
"# Get rid of undesirable columns. The full-name \"state\" column goes here so that\n",
"# the two-letter \"state_po\" code can take over the name \"state\" in the rename below.\n",
"pol = pol.drop(columns=[\n",
"    \"year\", \"state\", \"county_fips\", \"office\",\n",
"    \"candidate\", \"version\", \"mode\",\n",
"])\n",
"\n",
"# Change the column names to make them a little more friendly\n",
"pol = pol.rename(columns={\n",
"    \"county_name\": \"county\",\n",
"    \"state_po\": \"state\",\n",
"    \"candidatevotes\": \"votes\",\n",
"    \"totalvotes\": \"total\",\n",
"})\n",
"\n",
"# Normalise the casing of the text cells.\n",
"# County names use str.title() rather than capitalize(): capitalize() lowercases\n",
"# everything after the first letter (\"San francisco\"), which can never match a\n",
"# title-cased name in counties[\"name\"], so those rows would vanish in the merge.\n",
"# NOTE(review): names with an apostrophe or inner capitals (\"Prince George's\",\n",
"# \"DeKalb\") still will not match after title-casing -- check the merged row count.\n",
"pol[\"county\"] = pol[\"county\"].str.title()\n",
"pol[\"party\"] = pol[\"party\"].str.capitalize()\n",
"\n",
"# Combine the county name with the state code so it lines up with counties[\"name\"]\n",
"pol[\"county\"] = pol[\"county\"] + \" \" + pol[\"state\"]\n",
"\n",
"# Add a percent column which will be useful when graphing\n",
"pol[\"percent\"] = pol[\"votes\"] / pol[\"total\"]\n",
"\n",
"# Attach long/lat data to each row.\n",
"# merge() defaults to an inner join: election rows whose \"county\" string has no\n",
"# exact match in counties[\"name\"] are dropped without any warning.\n",
"pol = pol.merge(counties, left_on=\"county\", right_on=\"name\")\n",
"\n",
"# Now we can get rid of the state column and the duplicate join key\n",
"pol = pol.drop(columns=[\"state\", \"name\"])\n",
"\n",
"pol.to_csv(\"../data/processed/election-2012.csv\", index=False)\n",
"pol.head()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Tax_Mjoint</th>\n",
" <th>TaxRate_SS</th>\n",
" <th>TaxRate_FF</th>\n",
" <th>TaxRate_MM</th>\n",
" <th>Cns_RateSS</th>\n",
" <th>Cns_RateFF</th>\n",
" <th>Cns_RateMM</th>\n",
" <th>CountBars</th>\n",
" <th>FF_Index</th>\n",
" <th>MM_Index</th>\n",
" <th>SS_Index</th>\n",
" <th>TOTINDEX</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2120</td>\n",
" <td>203.301887</td>\n",
" <td>28.773585</td>\n",
" <td>174.528302</td>\n",
" <td>77.125329</td>\n",
" <td>6.931719</td>\n",
" <td>70.193610</td>\n",
" <td>15</td>\n",
" <td>6.724415</td>\n",
" <td>48.288254</td>\n",
" <td>55.012669</td>\n",
" <td>67.077054</td>\n",
" <td>34.093828</td>\n",
" <td>-118.381697</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5080</td>\n",
" <td>205.511811</td>\n",
" <td>33.464567</td>\n",
" <td>172.047244</td>\n",
" <td>88.478367</td>\n",
" <td>15.617404</td>\n",
" <td>72.860963</td>\n",
" <td>17</td>\n",
" <td>9.834048</td>\n",
" <td>48.578469</td>\n",
" <td>58.412517</td>\n",
" <td>61.866815</td>\n",
" <td>37.758057</td>\n",
" <td>-122.435410</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5790</td>\n",
" <td>107.772021</td>\n",
" <td>16.753022</td>\n",
" <td>91.018998</td>\n",
" <td>46.771050</td>\n",
" <td>5.745582</td>\n",
" <td>41.025469</td>\n",
" <td>5</td>\n",
" <td>4.370779</td>\n",
" <td>26.360413</td>\n",
" <td>30.731192</td>\n",
" <td>37.908747</td>\n",
" <td>40.742039</td>\n",
" <td>-74.000620</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3510</td>\n",
" <td>80.056980</td>\n",
" <td>21.082621</td>\n",
" <td>58.974359</td>\n",
" <td>31.619291</td>\n",
" <td>9.315448</td>\n",
" <td>22.303843</td>\n",
" <td>10</td>\n",
" <td>6.055939</td>\n",
" <td>15.939869</td>\n",
" <td>21.995808</td>\n",
" <td>37.530067</td>\n",
" <td>40.734012</td>\n",
" <td>-74.006746</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2660</td>\n",
" <td>91.353383</td>\n",
" <td>12.781955</td>\n",
" <td>78.571429</td>\n",
" <td>21.763042</td>\n",
" <td>3.142678</td>\n",
" <td>18.620365</td>\n",
" <td>9</td>\n",
" <td>3.004058</td>\n",
" <td>18.280165</td>\n",
" <td>21.284224</td>\n",
" <td>35.843573</td>\n",
" <td>37.773134</td>\n",
" <td>-122.411167</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Tax_Mjoint TaxRate_SS TaxRate_FF TaxRate_MM Cns_RateSS Cns_RateFF \\\n",
"0 2120 203.301887 28.773585 174.528302 77.125329 6.931719 \n",
"1 5080 205.511811 33.464567 172.047244 88.478367 15.617404 \n",
"2 5790 107.772021 16.753022 91.018998 46.771050 5.745582 \n",
"3 3510 80.056980 21.082621 58.974359 31.619291 9.315448 \n",
"4 2660 91.353383 12.781955 78.571429 21.763042 3.142678 \n",
"\n",
" Cns_RateMM CountBars FF_Index MM_Index SS_Index TOTINDEX \\\n",
"0 70.193610 15 6.724415 48.288254 55.012669 67.077054 \n",
"1 72.860963 17 9.834048 48.578469 58.412517 61.866815 \n",
"2 41.025469 5 4.370779 26.360413 30.731192 37.908747 \n",
"3 22.303843 10 6.055939 15.939869 21.995808 37.530067 \n",
"4 18.620365 9 3.004058 18.280165 21.284224 35.843573 \n",
"\n",
" lat long \n",
"0 34.093828 -118.381697 \n",
"1 37.758057 -122.435410 \n",
"2 40.742039 -74.000620 \n",
"3 40.734012 -74.006746 \n",
"4 37.773134 -122.411167 "
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## gb - the gaybourhoods dataset\n",
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")\n",
"\n",
"# Build gb in a single chain so re-running the cell always starts from the raw file.\n",
"# There's a lot of info baked into some of the remaining columns, especially the\n",
"# composite indexes, so their names are left as is for easy reference even if\n",
"# they're a little ugly. Only the coordinate columns are renamed.\n",
"gb = (\n",
"    pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"    # Add long/lat columns by matching the zip code. merge() defaults to an inner\n",
"    # join, so a GEOID10 with no matching ZIP row is dropped without warning.\n",
"    .merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\")\n",
"    # Get rid of unneeded columns (including both join keys)\n",
"    .drop(columns=[\n",
"        \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
"        \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
"        \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
"        \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
"        \"GEOID10\", \"ZIP\",\n",
"    ])\n",
"    .rename(columns={\n",
"        \"LAT\": \"lat\",\n",
"        \"LNG\": \"long\",\n",
"    })\n",
")\n",
"\n",
"# index=False keeps the positional row index out of the file (same convention as the\n",
"# election-2012 export), so re-reading the CSV does not produce an \"Unnamed: 0\" column.\n",
"gb.to_csv(\"../data/processed/gaybourhoods-nat.csv\", index=False)\n",
"gb.head()"
]
} }
], ],
"metadata": { "metadata": {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff