Move analysis1 chains to code.project_functions1

This commit is contained in:
Nat 2023-03-20 11:46:45 -07:00
parent 6b4e79d55e
commit 4566009cae
Signed by: nat
GPG Key ID: B53AB05285D710D6
4 changed files with 8844 additions and 42186 deletions

View File

@ -36,414 +36,39 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 49,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"import seaborn as sns" "import seaborn as sns\n",
"\n",
"# Absolutely diabolical method of doing relative imports with a package who shares its name with\n",
"# something in the stdlib in Jupyter Lab because it seems impossible otherwise\n",
"__import__(\"sys\").path.append(\"./code\")\n",
"from project_functions1 import *"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Loading the data" "## Data Analysis Pipeline"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 50,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"## counties - Relating US counties to their long/lat position on the Earth\n", "# Now in one, new-and-improved, non-descript method imported from another file\n",
"counties = pd.read_csv(\"../data/raw/us-county-boundaries.csv\", sep=\";\")\n", "gb, pol, counties, cords = load_and_process()"
"\n",
"## pol - Election results from the 2012 American presidential election\n",
"pol = pd.read_csv(\"../data/raw/countypres_2000-2020.csv\")\n",
"\n",
"## gb - the gaybourhoods dataset\n",
"gb = pd.read_csv(\"../data/raw/gaybourhoods.csv\")\n",
"\n",
"# cords - mapping zip codes to long/lat coordinates\n",
"cords = pd.read_csv(\"../data/raw/zip_lat_long.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cleaning the data"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"counties = counties.rename({\n",
" \"NAME\": \"name\",\n",
" \"INTPTLAT\": \"lat\",\n",
" \"INTPTLON\": \"long\",\n",
"}, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>county</th>\n",
" <th>party</th>\n",
" <th>votes</th>\n",
" <th>total</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>Autauga</td>\n",
" <td>Democrat</td>\n",
" <td>6363</td>\n",
" <td>23932</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>Autauga</td>\n",
" <td>Republican</td>\n",
" <td>17379</td>\n",
" <td>23932</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>Autauga</td>\n",
" <td>Other</td>\n",
" <td>190</td>\n",
" <td>23932</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>Baldwin</td>\n",
" <td>Democrat</td>\n",
" <td>18424</td>\n",
" <td>85338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>Baldwin</td>\n",
" <td>Republican</td>\n",
" <td>66016</td>\n",
" <td>85338</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state county party votes total\n",
"0 AL Autauga Democrat 6363 23932\n",
"1 AL Autauga Republican 17379 23932\n",
"2 AL Autauga Other 190 23932\n",
"3 AL Baldwin Democrat 18424 85338\n",
"4 AL Baldwin Republican 66016 85338"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We only want 2012--the latest election before the gb data was collected\n",
"pol = pol.query(\"`year` == 2012\") \\\n",
" .reset_index() \\\n",
" .drop([\n",
" \"year\", \"state\", \"county_fips\", \"office\",\n",
" \"candidate\", \"version\", \"mode\", \"index\",\n",
" ], axis=\"columns\") \\\n",
" .rename({\n",
" \"county_name\": \"county\",\n",
" \"state_po\": \"state\",\n",
" \"candidatevotes\": \"votes\",\n",
" \"totalvotes\": \"total\"\n",
" }, axis=\"columns\") \\\n",
" .apply(lambda x: x.str.capitalize() if x.name == \"county\" or x.name == \"party\" else x)\n",
"\n",
"pol.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Let's add long/lat columns to gb\n",
"gb = gb.merge(cords, left_on=\"GEOID10\", right_on=\"ZIP\") \\\n",
" .drop([\n",
" \"Mjoint_MF\", \"Mjoint_SS\", \"Mjoint_FF\", \"Mjoint_MM\",\n",
" \"Cns_TotHH\", \"Cns_UPSS\", \"Cns_UPFF\", \"Cns_UPMM\",\n",
" \"ParadeFlag\", \"FF_Tax\", \"FF_Cns\", \"MM_Tax\", \"MM_Cns\",\n",
" \"SS_Index_Weight\", \"Parade_Weight\", \"Bars_Weight\",\n",
" \"GEOID10\", \"ZIP\",\n",
" ], axis=\"columns\") \\\n",
" .rename({\n",
" \"LAT\": \"lat\",\n",
" \"LNG\": \"long\",\n",
" }, axis=\"columns\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process/Wrangle the data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hancock OH</td>\n",
" <td>41.000471</td>\n",
" <td>-83.666033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Stafford VA</td>\n",
" <td>38.413261</td>\n",
" <td>-77.451334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Webster NE</td>\n",
" <td>40.180646</td>\n",
" <td>-98.498590</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Dimmit TX</td>\n",
" <td>28.423587</td>\n",
" <td>-99.765871</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Cedar IA</td>\n",
" <td>41.772360</td>\n",
" <td>-91.132610</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name lat long\n",
"0 Hancock OH 41.000471 -83.666033\n",
"1 Stafford VA 38.413261 -77.451334\n",
"2 Webster NE 40.180646 -98.498590\n",
"3 Dimmit TX 28.423587 -99.765871\n",
"4 Cedar IA 41.772360 -91.132610"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combine the county name with the state code\n",
"def combine_name_state(row):\n",
" row[\"name\"] = f\"{row['name']} {row['STUSAB']}\"\n",
" return row\n",
"\n",
"counties = counties.apply(combine_name_state, axis=\"columns\") \\\n",
" .drop([\"STUSAB\"], axis=\"columns\")\n",
"\n",
"counties.to_csv(\"../data/processed/us-county-boundaries.csv\")\n",
"counties.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>county</th>\n",
" <th>party</th>\n",
" <th>votes</th>\n",
" <th>total</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>percent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Autauga AL</td>\n",
" <td>Democrat</td>\n",
" <td>6363</td>\n",
" <td>23932</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" <td>0.265878</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Autauga AL</td>\n",
" <td>Republican</td>\n",
" <td>17379</td>\n",
" <td>23932</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" <td>0.726183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Autauga AL</td>\n",
" <td>Other</td>\n",
" <td>190</td>\n",
" <td>23932</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" <td>0.007939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Baldwin AL</td>\n",
" <td>Democrat</td>\n",
" <td>18424</td>\n",
" <td>85338</td>\n",
" <td>30.659218</td>\n",
" <td>-87.746067</td>\n",
" <td>0.215894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Baldwin AL</td>\n",
" <td>Republican</td>\n",
" <td>66016</td>\n",
" <td>85338</td>\n",
" <td>30.659218</td>\n",
" <td>-87.746067</td>\n",
" <td>0.773583</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" county party votes total lat long percent\n",
"0 Autauga AL Democrat 6363 23932 32.532237 -86.646439 0.265878\n",
"1 Autauga AL Republican 17379 23932 32.532237 -86.646439 0.726183\n",
"2 Autauga AL Other 190 23932 32.532237 -86.646439 0.007939\n",
"3 Baldwin AL Democrat 18424 85338 30.659218 -87.746067 0.215894\n",
"4 Baldwin AL Republican 66016 85338 30.659218 -87.746067 0.773583"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Combine the county name with the state code\n",
"def combine_name_state(row):\n",
" row[\"county\"] = f\"{row['county']} {row['state']}\"\n",
" return row\n",
"\n",
"pol = pol.apply(combine_name_state, axis=\"columns\") \\\n",
" .merge(counties, left_on=\"county\", right_on=\"name\") \\\n",
" .drop([\"state\", \"name\"], axis=\"columns\") \\\n",
" .assign(percent=lambda x: x.votes/x.total)\n",
"\n",
"pol.to_csv(\"../data/processed/election-2012.csv\", index=False)\n",
"pol.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -596,7 +221,7 @@
"4 37.773134 -122.411167 " "4 37.773134 -122.411167 "
] ]
}, },
"execution_count": 8, "execution_count": 51,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -606,6 +231,115 @@
"gb.head()" "gb.head()"
] ]
}, },
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>county</th>\n",
" <th>party</th>\n",
" <th>votes</th>\n",
" <th>total</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>percent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Autauga AL</td>\n",
" <td>Democrat</td>\n",
" <td>6363</td>\n",
" <td>23932</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" <td>0.265878</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Autauga AL</td>\n",
" <td>Republican</td>\n",
" <td>17379</td>\n",
" <td>23932</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" <td>0.726183</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Autauga AL</td>\n",
" <td>Other</td>\n",
" <td>190</td>\n",
" <td>23932</td>\n",
" <td>32.532237</td>\n",
" <td>-86.646439</td>\n",
" <td>0.007939</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Baldwin AL</td>\n",
" <td>Democrat</td>\n",
" <td>18424</td>\n",
" <td>85338</td>\n",
" <td>30.659218</td>\n",
" <td>-87.746067</td>\n",
" <td>0.215894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Baldwin AL</td>\n",
" <td>Republican</td>\n",
" <td>66016</td>\n",
" <td>85338</td>\n",
" <td>30.659218</td>\n",
" <td>-87.746067</td>\n",
" <td>0.773583</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" county party votes total lat long percent\n",
"0 Autauga AL Democrat 6363 23932 32.532237 -86.646439 0.265878\n",
"1 Autauga AL Republican 17379 23932 32.532237 -86.646439 0.726183\n",
"2 Autauga AL Other 190 23932 32.532237 -86.646439 0.007939\n",
"3 Baldwin AL Democrat 18424 85338 30.659218 -87.746067 0.215894\n",
"4 Baldwin AL Republican 66016 85338 30.659218 -87.746067 0.773583"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pol.to_csv(\"../data/processed/election-2012.csv\")\n",
"pol.head()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},

View File

@ -0,0 +1,69 @@
import pandas as pd
def load_and_process(raw_dir="../data/raw"):
    """Load the four raw CSV datasets and return them cleaned and joined.

    Parameters
    ----------
    raw_dir : str or os.PathLike, default "../data/raw"
        Directory containing ``zip_lat_long.csv``, ``us-county-boundaries.csv``
        (``;``-separated), ``countypres_2000-2020.csv`` and
        ``gaybourhoods.csv``. A relative value is resolved against the current
        working directory; the default reproduces the previously hard-coded
        location.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gb, pol, counties, cords)`` where

        * ``gb`` is the gaybourhoods table merged with ``cords`` on
          ``GEOID10 == ZIP`` (pandas' default inner join), with the
          intermediate/weight columns removed and ``LAT``/``LNG`` renamed to
          ``lat``/``long``;
        * ``pol`` holds the rows with ``year == 2012``, with ``county`` set to
          ``"<County> <state_po>"``, merged with ``counties`` on that label,
          plus ``percent = votes / total``;
        * ``counties`` has ``NAME``/``INTPTLAT``/``INTPTLON`` renamed to
          ``name``/``lat``/``long`` and ``name`` set to ``"<name> <STUSAB>"``;
        * ``cords`` is ``zip_lat_long.csv`` exactly as read.

    Raises
    ------
    FileNotFoundError
        If any of the four CSV files is missing from ``raw_dir``.
    KeyError
        If an expected column is absent from one of the files.
    """
    # Function-scope import so the module's top-level import block is untouched.
    from pathlib import Path

    raw_dir = Path(raw_dir)

    def with_state_suffix(frame, name_col, state_col):
        """Return a copy of `frame` whose `name_col` reads "<name> <state>"."""
        # One pass over the two columns instead of a row-wise DataFrame.apply;
        # the f-string keeps the exact formatting the row-wise version produced.
        labels = [
            f"{name} {state}"
            for name, state in zip(frame[name_col], frame[state_col])
        ]
        return frame.assign(**{name_col: labels})

    # cords - mapping zip codes to long/lat coordinates
    cords = pd.read_csv(raw_dir / "zip_lat_long.csv")

    ## counties - Relating US counties to their long/lat position on the Earth
    counties = (
        pd.read_csv(raw_dir / "us-county-boundaries.csv", sep=";")
        .rename(columns={
            "NAME": "name",
            "INTPTLAT": "lat",
            "INTPTLON": "long",
        })
        # Combine the county name with the state code
        .pipe(with_state_suffix, "name", "STUSAB")
        .drop(columns=["STUSAB"])
    )

    ## pol - Election results from the 2012 American presidential election
    pol = (
        pd.read_csv(raw_dir / "countypres_2000-2020.csv")
        .query("`year` == 2012")
        # drop=True discards the old index instead of adding an "index" column
        # that would only have to be dropped again.
        .reset_index(drop=True)
        # The full-name "state" column is dropped here so that "state_po" can
        # take over the name "state" in the rename below.
        .drop(columns=[
            "year", "state", "county_fips", "office",
            "candidate", "version", "mode",
        ])
        .rename(columns={
            "county_name": "county",
            "state_po": "state",
            "candidatevotes": "votes",
            "totalvotes": "total",
        })
        .assign(
            county=lambda df: df["county"].str.capitalize(),
            party=lambda df: df["party"].str.capitalize(),
        )
        # Build the same "<County> <ST>" label used by counties["name"] so the
        # two tables can be merged on it.
        .pipe(with_state_suffix, "county", "state")
        .merge(counties, left_on="county", right_on="name")
        .drop(columns=["state", "name"])
        .assign(percent=lambda df: df.votes / df.total)
    )

    ## gb - the gaybourhoods dataset
    gb = (
        pd.read_csv(raw_dir / "gaybourhoods.csv")
        # Attach lat/long to each row through its zip code.
        .merge(cords, left_on="GEOID10", right_on="ZIP")
        .drop(columns=[
            "Mjoint_MF", "Mjoint_SS", "Mjoint_FF", "Mjoint_MM",
            "Cns_TotHH", "Cns_UPSS", "Cns_UPFF", "Cns_UPMM",
            "ParadeFlag", "FF_Tax", "FF_Cns", "MM_Tax", "MM_Cns",
            "SS_Index_Weight", "Parade_Weight", "Bars_Weight",
            "GEOID10", "ZIP",
        ])
        .rename(columns={
            "LAT": "lat",
            "LNG": "long",
        })
    )

    return (gb, pol, counties, cords)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff