Module dp_policy.titlei.evaluation
Expand source code Browse git
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import seaborn as sns
import pickle
from dp_policy.titlei.utils import get_acs_unified
import dp_policy.config as config
from typing import Dict, Callable, Union
def get_geography() -> pd.DataFrame:
    """Load shapefiles for LEAs.

    Returns:
        pd.DataFrame: Shapefiles indexed by school district.
    """
    shapefile = os.path.join(
        config.root,
        "data/shapefiles/school_districts_19/schooldistrict_sy1819_tl19.shp"
    )
    geo = gpd.read_file(shapefile)
    geo.STATEFP = geo.STATEFP.astype(int)
    # Prefer the unified LEA code, then secondary, then elementary.
    district_id = geo.UNSDLEA.where(
        geo.UNSDLEA.notna(),
        geo.SCSDLEA.where(geo.SCSDLEA.notna(), geo.ELSDLEA)
    )
    geo["District ID"] = district_id.astype(int)
    geo = geo.rename(
        columns={"STATEFP": "State FIPS Code"}
    ).set_index(["State FIPS Code", "District ID"])
    return geo
def discrimination_join(
    results: pd.DataFrame,
    save_path: str = None,
    verbose: bool = False,
    include_moes: bool = False
) -> pd.DataFrame:
    """Join results to demographic covariates.

    Builds a list of ACS demographic variables (race, ethnicity, income,
    immigration, language, housing), cleans them to numeric, joins in land
    area from the LEA shapefiles, and inner-joins the whole set onto
    `results`.

    Args:
        results (pd.DataFrame): Results, keyed by LEA.
        save_path (str, optional): Where to save the joined dataframe.
            Defaults to None.
        verbose (bool, optional): Defaults to False.
        include_moes (bool, optional): Whether to include moes with percents.
            Defaults to False.

    Returns:
        pd.DataFrame: Joined dataframe.
    """
    acs = get_acs_unified(verbose)
    variables = [
        'Total population (RACE) - est',
    ]
    # add race variables
    # (single-race percent columns only; combination rows are filtered out,
    # then the "Two or more races" total is added back explicitly)
    variables += [
        r for r in acs.columns
        if (
            r.endswith("(RACE) - pct")
            or (include_moes and r.endswith("(RACE) - pctmoe"))
        )
        and "and" not in r
        and "races" not in r
        and not r.startswith("One race")
    ] + ["Two or more races (RACE) - pct"]
    if verbose and include_moes:
        print("Including MOEs")
    # add ethnicity variables
    # NOTE(review): positions [1:6] presumably skip the overall total and
    # keep the five main subgroups — confirm against the ACS column order.
    hisp = [
        r for r in acs.columns
        if r.endswith("(HISPANIC OR LATINO AND RACE) - pct")
    ]
    variables += hisp[1:6]
    # add income variables
    variables += [
        r for r in acs.columns
        if r.startswith("Median household income (dollars) (")
    ]
    # add rural/urban - need a 3rd data source
    # add immigrant status
    variables += [
        # "Foreign born (PLACE OF BIRTH) - est",
        "Foreign born (PLACE OF BIRTH) - pct",
        # "Not a U.S. citizen (U.S. CITIZENSHIP STATUS) - est",
        "Not a U.S. citizen (U.S. CITIZENSHIP STATUS) - pct"
    ]
    # add language isolation
    variables += [
        # 'Language other than English (LANGUAGE SPOKEN AT HOME) - est',
        'Language other than English (LANGUAGE SPOKEN AT HOME) - pct'
    ]
    # add renters vs. homeowners (housing security)
    variables += [
        # 'Renter-occupied (HOUSING TENURE) - est',
        'Renter-occupied (HOUSING TENURE) - pct',
        'Average household size of renter-occupied unit (HOUSING TENURE) - est'
    ]
    # look in these columns for '-' and replace with nan according to ACS docs
    # https://www.census.gov/data/developers/data-sets/acs-1year/notes-on-acs-estimate-and-annotation-values.html
    # otherwise convert to numeric
    # ('250,000+' is the ACS top-coded income value)
    acs_vars = acs[variables]\
        .replace(['-', "**", "***", "(X)", "N", "null"], np.nan)\
        .replace('250,000+', 250000)\
        .apply(pd.to_numeric, errors='raise')
    if verbose:
        print(variables)
        print(acs_vars.shape)
        print(results.shape)
    # adding geographic area
    geo = get_geography()
    to_join = acs_vars.join(geo["ALAND"], how="inner")
    if verbose:
        print("ACS", acs_vars.shape)
        print("Geo joined ACS", to_join.shape)
        print(
            to_join[
                to_join["Total population (RACE) - est"].isna()
            ].groupby(["State FIPS Code", "District ID"]).groups.keys()
        )
    print("Joining ACS/geo variables...")
    # inner join: districts missing from ACS/geo are dropped from results
    grants = results.join(to_join, how="inner")
    if verbose:
        print(
            "missing some districts in ACS/geo:",
            len(results.groupby([
                "State FIPS Code", "District ID"
            ])) - len(grants.groupby([
                "State FIPS Code", "District ID"
            ]))
        )
        print(grants.shape)
    if save_path:
        print("Saving to feather...")
        # feather requires a default RangeIndex, hence reset_index()
        grants.reset_index().to_feather(f"{save_path}.feather")
        print("... saved.")
    return grants
def discrimination_treatments_join(
    treatments_name: str,
    exclude: list = None,
    epsilon: float = 0.1,
    delta: float = 0.0,
    **join_kwargs
) -> pd.DataFrame:
    """Join saved treatment results to demographic covariates.

    Args:
        treatments_name (str): Name of the saved experiment to load
            (forwarded to `load_treatments` and used in the output path).
        exclude (list, optional): Treatments to exclude. Defaults to None
            (exclude nothing).
        epsilon (float, optional): Epsilon to use (one allowed). Defaults to
            0.1.
        delta (float, optional): Delta to use (one allowed). Defaults to 0.0.
        **join_kwargs: Forwarded to `discrimination_join`.

    Returns:
        pd.DataFrame: Combined dataframe for all treatments.
    """
    # BUGFIX: `exclude` previously used a mutable default argument ([]),
    # which is shared across calls.
    if exclude is None:
        exclude = []
    # output a concatenated DF with a new index column indicating which
    # treatment was applied
    treatments = {
        treatment: df.loc[(
            slice(None),
            delta if delta is not None else slice(None),
            epsilon if epsilon is not None else slice(None),
            slice(None),
            slice(None)
        ), [
            c for c in df.columns
            if c.endswith("- pct")
            or c.endswith("- est")
            or c.startswith("true")
            or c.endswith("grant_total")
            or c in [
                "ALAND"
            ]
        ]]
        for treatment, df in load_treatments(treatments_name).items()
        if treatment not in exclude
    }
    print("Saving", treatments.keys())
    # concat adds a new outermost index level named 'treatment'
    joined = pd.concat(
        treatments,
        names=['treatment']
    )
    print("Concatenated has {} rows and {} treatments".format(
        len(joined),
        len(treatments)
    ))
    discrimination_joined = discrimination_join(
        joined,
        save_path=f"{config.root}/results/policy_experiments/"
        f"{treatments_name}_discrimination_laplace",
        **join_kwargs
    )
    return discrimination_joined
def percentile(n):
    """Build a named quantile aggregator for use with `DataFrame.agg`.

    Args:
        n: Quantile level in [0, 1].

    Returns:
        Callable: Function computing the n-th quantile of its input, with
            `__name__` set to `percentile_<n*100>` so aggregated columns
            get a readable label.
    """
    def quantile_agg(values):
        return values.quantile(n)
    quantile_agg.__name__ = f'percentile_{n * 100}'
    return quantile_agg
def _eligibility_changes(results: pd.DataFrame, prefix: str):
    """Compute eligibility-change indicators for one estimate prefix.

    Compares `<prefix>_eligible_{targeted,basic,concentration}` against the
    corresponding `true_eligible_*` columns.

    Args:
        results (pd.DataFrame): Results containing the eligibility columns.
        prefix (str): Estimate column prefix, e.g. "est" or "dpest".

    Returns:
        tuple: (switched, became_eligible, became_ineligible) 0/1 Series —
            switched if any of the three grant-type eligibilities differs,
            became_(in)eligible if any grant type was gained (lost).
    """
    all_match = None
    any_gained = None
    any_lost = None
    for grant in ["targeted", "basic", "concentration"]:
        est = results[f"{prefix}_eligible_{grant}"]
        true = results[f"true_eligible_{grant}"]
        # raw equality (not bool-cast) mirrors the original "switched" logic
        match = est == true
        gained = est.astype(bool) & ~true.astype(bool)
        lost = ~est.astype(bool) & true.astype(bool)
        all_match = match if all_match is None else all_match & match
        any_gained = gained if any_gained is None else any_gained | gained
        any_lost = lost if any_lost is None else any_lost | lost
    return (
        (~all_match).astype(int),
        any_gained.astype(int),
        any_lost.astype(int)
    )


def geo_join(results: pd.DataFrame) -> pd.DataFrame:
    """Join results with shapefiles.

    Args:
        results (pd.DataFrame): Results, keyed by LEA.

    Returns:
        pd.DataFrame: Results joined with LEA shapefile and geographic data.
    """
    # NOTE: only accepts "State FIPS Code", "District ID", "trial" in index
    results = results.copy()
    # per-district error for the non-private estimate ("est") ...
    results["error"] = results.est_grant_total - results.true_grant_total
    results["error_per_child"] = results.error / results['true_children_total']
    results["error_per_child_eligible"] = \
        results.error / results['true_children_eligible']
    # ... and for the differentially private estimate ("dpest")
    results["error_dp"] = results.dpest_grant_total - results.true_grant_total
    results["error_dp_per_child"] = \
        results.error_dp / results['true_children_total']
    results["error_dp_per_child_eligible"] = \
        results.error_dp / results['true_children_eligible']
    results["percent_eligible"] = \
        results["true_children_eligible"] / results["true_children_total"]
    # eligibility-change indicators for est (no suffix) and dpest ("_dp"),
    # previously three near-identical inline expressions per prefix
    for prefix, suffix in [("est", ""), ("dpest", "_dp")]:
        switched, gained, lost = _eligibility_changes(results, prefix)
        results[f"switched_eligibility{suffix}"] = switched
        results[f"became_eligible{suffix}"] = gained
        results[f"became_ineligible{suffix}"] = lost
    # marginal error attributable to DP noise beyond data error alone
    results["dp_marginal"] = \
        results["error_dp_per_child_eligible"] -\
        results["error_per_child_eligible"]
    geo = get_geography()
    joined = geo.join(
        results[[
            "error",
            "error_per_child",
            "error_per_child_eligible",
            "error_dp",
            "error_dp_per_child",
            "error_dp_per_child_eligible",
            "true_children_eligible",
            "true_pop_total",
            "percent_eligible",
            "true_grant_total",
            "switched_eligibility",
            "became_eligible",
            "became_ineligible",
            "switched_eligibility_dp",
            "became_eligible_dp",
            "became_ineligible_dp",
            "dp_marginal"
        ]]
        .groupby(["State FIPS Code", "District ID"])
        .agg(['mean', 'std', 'sem', percentile(0.05)]),
        how="inner"
    )
    # flatten the (column, statistic) MultiIndex; 'mean' is treated as the
    # default statistic and dropped from the name
    joined.columns = [
        col if isinstance(col, str) else '_'.join([
            c for c in col if c != 'mean'
        ]).rstrip('_')
        for col in joined.columns.values
    ]
    # division by zero children yields +/-inf; treat those as missing
    for col in [
        "error_per_child",
        "error_per_child_eligible",
        "error_dp_per_child",
        "error_dp_per_child_eligible"
    ]:
        joined.loc[
            np.isinf(joined[col]), col
        ] = np.nan
    return joined
def plot_treatments(
    treatments: Dict[str, pd.DataFrame],
    x_func: Callable,
    plot_method: Callable,
    plot_kwargs: dict,
    filename: str = None,
    xlab: str = None,
    ylab: str = "Smoothed density",
    grant: str = "total",
    epsilon: float = None,
    delta: float = None,
    mean_line: bool = False
):
    """Plot treatment results.

    Args:
        treatments (Dict[str, pd.DataFrame]): Treatment results mapped by
            treatment name.
        x_func (Callable): Function to transform dataframe before plotting.
        plot_method (Callable): Function for plotting.
        plot_kwargs (dict): Parameters for plotting function. NOTE: mutated
            in place (label/color are injected per treatment).
        filename (str, optional): Filename of plot. Defaults to None.
        xlab (str, optional): X label for plot. Defaults to None.
        ylab (str, optional): Y label for plot. Defaults to "Smoothed density".
        grant (str, optional): Grant type to plot. Defaults to "total".
        epsilon (float, optional): Epsilon to plot. Defaults to None.
        delta (float, optional): Delta to plot. Defaults to None.
        mean_line (bool, optional): Whether to include mean line. Defaults to
            False.
    """
    palette = sns.color_palette(n_colors=len(treatments))
    for i, (treatment, df_raw) in enumerate(treatments.items()):
        # restrict to the requested (delta, epsilon) slice of the index
        df = df_raw.loc[pd.IndexSlice[
            :,
            delta if delta is not None else slice(None),
            epsilon if epsilon is not None else slice(None),
            :,
            :
        ], :].copy()
        df.loc[:, "misalloc"] = \
            df[f"dpest_grant_{grant}"] - df[f"true_grant_{grant}"]
        df.loc[:, "misalloc_sq"] = np.power(df["misalloc"], 2)
        if grant == "total":
            # NOTE(review): for "total" this flags dpest-eligible-but-not-true
            # (i.e. *gained* eligibility), while the per-grant branch below
            # flags *lost* eligibility — confirm this asymmetry is intended.
            df["lost_eligibility"] = \
                (
                    df["dpest_eligible_basic"].astype(bool) &
                    ~df["true_eligible_basic"].astype(bool)
                ) |\
                (
                    df["dpest_eligible_concentration"].astype(bool) &
                    ~df["true_eligible_concentration"].astype(bool)
                ) |\
                (
                    df["dpest_eligible_targeted"].astype(bool) &
                    ~df["true_eligible_targeted"].astype(bool)
                )
        else:
            df["lost_eligibility"] = \
                ~df["dpest_eligible_{}".format(grant)].astype(bool) \
                & df["true_eligible_{}".format(grant)].astype(bool)
        plot_kwargs.update({
            'label': treatment,
            'color': palette[i]
        })
        x = x_func(df)
        plot_method(x, **plot_kwargs)
        if mean_line:
            plt.axvline(
                x.mean(), color=palette[i],
                linestyle='dashed'
            )
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.legend(loc='upper right')
    if filename:
        # BUGFIX: previously saved to a hardcoded "(unknown).pdf" path,
        # ignoring the `filename` argument entirely.
        plt.savefig(
            f"{config.root}/plots/bootstrap/{filename}.pdf",
            transparent=True,
            bbox_inches='tight'
        )
    plt.show()
    plt.close()
def cube(x):
    """Signed cube root: |x|**(1/3) with the sign of x preserved."""
    magnitude = np.power(np.abs(x), 1 / 3)
    return np.sign(x) * magnitude
def heatmap(
    data: pd.DataFrame,
    label: str = None,
    title: str = None,
    transform: str = 'cube',
    theme: str = "seismic_r",
    y: str = "error_dp_per_child",
    vcenter: int = 0,
    file: str = None,
    figsize: tuple = (10, 5),
    bar_location: str = 'bottom',
    min: int = None,
    max: int = None,
    dpi: int = 300,
    alpha: float = 0.1
):
    """Plot heatmap of results.

    NOTE: mutates `data` in place — adds `{y}_moe`/`{y}_sig` columns and
    overwrites `y` (and its MOE) with the transformed values. `min`/`max`
    intentionally shadow the builtins to preserve the keyword interface.

    Args:
        data (pd.DataFrame): Dataframe to plot from (a GeoDataFrame-style
            frame supporting `.plot(column=...)`).
        label (str, optional): Label for colorbar. Defaults to None.
        title (str, optional): Title for plot. Defaults to None.
        transform (str, optional): Transformation for data. Defaults to 'cube'.
        theme (str, optional): Theme for colorbar. Defaults to "seismic_r".
        y (str, optional): Column to plot. Defaults to "error_dp_per_child".
        vcenter (int, optional): Center of colorbar. Defaults to 0.
        file (str, optional): Filename. Defaults to None.
        figsize (tuple, optional): Figure size. Defaults to (10, 5).
        bar_location (str, optional): Where to place the colorbar. Defaults to
            'bottom'.
        min (int, optional): Minimum for colorbar. Defaults to None.
        max (int, optional): Maximum for colorbar. Defaults to None.
        dpi (int, optional): Figure DPI. Defaults to 300.
        alpha (float, optional): Confidence level for t-test. Defaults to 0.1.
    """
    if alpha is not None:
        # two-sided normal margin of error at confidence level (1 - alpha)
        data[f"{y}_moe"] = data.loc[:, f"{y}_sem"] * \
            stats.norm.ppf(1 - alpha / 2)
        # significant iff the MOE interval around y excludes zero
        sig = ~(
            ((data[y] + data[f"{y}_moe"]) >= 0) &
            ((data[y] - data[f"{y}_moe"]) <= 0)
        )
        print(
            "All but",
            len(data)-sig.sum(),
            f"are significantly different from zero at {alpha}"
        )
    fig, ax = plt.subplots(1, figsize=figsize, dpi=dpi)
    # transform both the plotted column and (if present) its MOE
    for key in [y, f"{y}_moe"] if alpha is not None else [y]:
        if transform == 'cube':
            data.loc[:, key] = cube(data[key])
        if transform == 'log':
            data.loc[:, key] = np.where(data[key] == 0, 0, np.log(data[key]))
        if transform == 'sqrt':
            data.loc[:, key] = np.sign(data[key]) * np.sqrt(np.abs(data[key]))
    # Create colorbar as a legend
    if min is None:
        min = data[y].min()
    if max is None:
        max = data[y].max()
    bound = np.max(np.abs([min, max]))
    if vcenter is not None and transform != 'log':
        # BUGFIX: vcenter was previously hardcoded to 0, silently ignoring
        # the `vcenter` argument. Requires -bound < vcenter < bound.
        norm = pltc.TwoSlopeNorm(vcenter=vcenter, vmin=-bound, vmax=bound)
    else:
        norm = pltc.Normalize(vmin=min, vmax=max)
    sm = plt.cm.ScalarMappable(cmap=theme, norm=norm)
    if alpha is not None:
        print(
            f"None of the {(1-alpha)*100}% MOEs exceeds",
            data[f"{y}_moe"].abs().max()
        )
        # non-significant districts become NaN and are drawn hatched
        data[f"{y}_sig"] = np.where(sig, data[y], np.nan)
    data.plot(
        column=f"{y}_sig" if alpha is not None else y,
        cmap=theme,
        norm=norm,
        ax=ax,
        linewidth=0.05,
        edgecolor='0.1',
        missing_kwds=dict(
            hatch='///',
            edgecolor=(0, 0, 0, 0.25),
            facecolor='none',
            label=f"Not significant (p < {alpha})"
        ),
        rasterized=True
    )
    cb = fig.colorbar(
        sm,
        location=bar_location,
        shrink=0.5, pad=0.05, aspect=30
    )
    cb.set_label(label)
    if title is not None:
        plt.title(title)
    plt.axis('off')
    plt.tight_layout()
    if file is not None:
        plt.savefig(
            f"{config.root}/plots/geo/{file}",
            bbox_inches='tight',
            transparent=True,
            dpi=dpi
        )
        plt.close()
    else:
        plt.show()
def save_treatments(
    treatments: Dict[str, pd.DataFrame],
    experiment_name: str
):
    """Save treatment results.

    Args:
        treatments (Dict[str, pd.DataFrame]): Dictionary of treatment names
            mapped to results.
        experiment_name (str): Name for this collection of treatments.
    """
    # minify treatments: keep only the columns downstream analysis uses
    treatments = {
        treatment: df.loc[:, [
            c for c in df.columns
            if c.endswith("- pct")
            or c.endswith("- est")
            or c.startswith("true")
            or "eligible" in c
            or "children_total" in c
            or "grant" in c
            or c in [
                "ALAND"
            ]
        ]]
        for treatment, df in treatments.items()
    }
    # BUGFIX: use a context manager so the file handle is closed even if
    # pickling raises (previously the handle was never explicitly closed).
    path = f"{config.root}/results/policy_experiments/{experiment_name}.pkl"
    with open(path, 'wb') as f:
        pickle.dump(treatments, f)
def load_treatments(
    experiment_name: str,
    treatment_name: str = None
) -> Union[Dict[str, pd.DataFrame], pd.DataFrame]:
    """Load treatment results from memory.

    Args:
        experiment_name (str): Experiment to load.
        treatment_name (str, optional): Name of treatment to load. If not
            specified, loads all treatments. Defaults to None.

    Returns:
        Dict[str, pd.DataFrame]: Dictionary of treatments mapped to results,
            or individual set of results if `treatment_name` is specified.
    """
    path = f"{config.root}/results/policy_experiments/{experiment_name}.pkl"
    # BUGFIX: context manager closes the handle promptly (previously the
    # file object was left open). Pickle is only safe here because these
    # files are produced locally by save_treatments.
    with open(path, 'rb') as f:
        treatments = pickle.load(f)
    if treatment_name is None:
        return treatments
    return treatments[treatment_name]
def compare_treatments(
    treatments: Dict[str, pd.DataFrame],
    epsilon: float = 0.1,
    delta: float = 0.0,
    mapvar: str = "error_per_child",
    experiment_name: str = None
):
    """Compare treatment results.

    Prints budget/misallocation summaries per treatment, then plots
    comparative distributions and per-treatment nationwide heatmaps.
    NOTE: mutates the caller's `treatments` dict in place, replacing each
    dataframe with its (delta, epsilon) slice.

    Args:
        treatments (Dict[str, pd.DataFrame]): Treatment results mapped by
            treatment names.
        epsilon (float, optional): Epsilon to use. Defaults to 0.1.
        delta (float, optional): Delta to use. Defaults to 0.0.
        mapvar (str, optional): Which variable to plot. Defaults to
            "error_per_child".
        experiment_name (str, optional): Name of the experiment. Defaults to
            None.
    """
    if epsilon is None:
        print(
            "[WARN] Epsilon is none "
            "- only use this if there is only one eps value in the df."
        )
    else:
        print("Comparing at eps=", epsilon)
    for treatment, df in treatments.items():
        # restrict to the requested (delta, epsilon) slice of the index
        df = df.loc[pd.IndexSlice[
            :,
            delta if delta is not None else slice(None),
            epsilon if epsilon is not None else slice(None),
            :,
            :
        ], :].copy()
        # store the sliced frame back so the plotting calls below see it
        treatments[treatment] = df
        print(len(df))
        print("\n#", treatment)
        print("True budget:", df[f"true_grant_total"].sum())
        print("DP est budget:", df[f"dpest_grant_total"].sum())
        # erroneously ineligible under DP for any of the three grant types
        df["became_ineligible"] = \
            (
                ~df.dpest_eligible_targeted.astype(bool)
                & df.true_eligible_targeted.astype(bool)
            )\
            | (
                ~df.dpest_eligible_basic.astype(bool)
                & df.true_eligible_basic.astype(bool)
            )\
            | (
                ~df.dpest_eligible_concentration.astype(bool)
                & df.true_eligible_concentration.astype(bool)
            )
        print(
            "Avg prop. districts erroneously ineligible:",
            df.became_ineligible.groupby("trial").sum().mean()
        )
        # error statistics for the plain estimate, the DP estimate, and the
        # marginal difference between the two
        print("# est")
        misalloc_statistics(
            df.est_grant_total - df.true_grant_total
        )
        print("# dpest")
        misalloc_statistics(
            df.dpest_grant_total - df.true_grant_total
        )
        print("# marginal")
        misalloc_statistics(
            df.dpest_grant_total - df.est_grant_total
        )
    # compare bias
    plot_treatments(
        treatments,
        lambda df: np.sqrt(df.groupby('trial')["misalloc"].mean()),
        sns.kdeplot,
        dict(bw_method=0.5, fill=True),
        epsilon=epsilon,
        delta=delta,
        xlab=f"Mean misalloc (per trial)",
        mean_line=True
    )
    # compare RMSE - mean and hist
    plot_treatments(
        treatments,
        lambda df: np.sqrt(df.groupby('trial')["misalloc_sq"].mean()),
        sns.kdeplot,
        dict(bw_method=0.5, fill=True),
        filename=f"{experiment_name}_rmse",
        epsilon=epsilon,
        delta=delta,
        xlab=f"RMSE (per trial)",
        mean_line=True
    )
    # compare likelihood of ineligibility
    plot_treatments(
        treatments,
        lambda df:
        df.groupby(['State FIPS Code', 'District ID'])["lost_eligibility"]
        .mean(),
        sns.kdeplot,
        dict(bw_method=0.5, fill=True),
        # filename=f"likelihood_ineligible_total",
        xlab=f"Likelihood of losing eligibility",
        epsilon=epsilon,
        delta=delta,
        mean_line=True
    )
    # compare nationwide map
    print("Plotting", mapvar)
    ymins = []
    ymaxs = []
    treatments_geo = {
        treatment:
        geo_join(
            df.loc[pd.IndexSlice[
                :,
                delta if delta is not None else slice(None),
                epsilon if epsilon is not None else slice(None),
                :,
                :
            ]]
        )
        for treatment, df in treatments.items()
    }
    # shared color scale across treatments, excluding AK (2) and HI (15)
    for treatment, df in treatments_geo.items():
        err = cube(df.loc[[
            f for f in df.index.get_level_values("State FIPS Code").unique()
            if f not in [2, 15]
        ]]["error_per_child"])
        ymins.append(err.min())
        ymaxs.append(err.max())
    ymin = np.min(ymins)
    ymax = np.max(ymaxs)
    for treatment, df in treatments_geo.items():
        heatmap(
            df.loc[[
                f
                for f in df.index.get_level_values("State FIPS Code").unique()
                if f not in [2, 15]
            ]],
            y="error_per_child_eligible",
            label="Misallocation per eligible child (cube root)",
            title=treatment,
            file=f"{experiment_name}_{treatment}.pdf",
            figsize=(15, 10),
            bar_location='right',
            min=ymin,
            max=ymax,
            dpi=50
        )
def match_true(
    df_true: pd.DataFrame,
    dfs_to_match: pd.DataFrame
):
    """Equalize result baselines (columns with "true").

    Copies every column of `df_true` whose name contains "true" into each
    dataframe in `dfs_to_match`, overwriting in place.

    Args:
        df_true (pd.DataFrame): Dataframe with ground truth.
        dfs_to_match (pd.DataFrame): Dataframes to equalize to `df_true`.
    """
    true_columns = [col for col in df_true.columns if "true" in col]
    for df in dfs_to_match:
        for col in true_columns:
            df.loc[:, col] = df_true.loc[:, col]
def misalloc_statistics(
    error: pd.Series,
    allocations: pd.DataFrame = None,
    grant_type: str = None
):
    """Print statistics describing misallocation in a set of simulations.

    Args:
        error (pd.Series): Error in allocation indexed by district
            ("State FIPS Code", "District ID") and "trial".
        allocations (pd.DataFrame, optional): Full set of allocations.
            Defaults to None.
        grant_type (str, optional): Optionally, a grant type to print specific
            statistics for. Defaults to None.
    """
    # expected (across-trial mean) error per district
    err_grouped = error.groupby(
        ["State FIPS Code", "District ID"]
    )
    exp_error = err_grouped.mean()
    print(f"# rows: {len(error)}")
    print(f"Max error: {np.abs(error).max()}")
    print("-- RMSE --")
    # overall RMSE across every (district, trial) row
    print(f"RMSE:", np.sqrt(np.mean(error**2)))
    # mean of per-trial RMSEs
    print(
        "Avg. RMSE",
        np.mean(
            np.sqrt((error**2).groupby('trial').mean())
        )
    )
    # RMSE of the per-district expected errors
    print(
        f"RMSE in exp. error:",
        np.sqrt(np.mean(exp_error**2))
    )
    print("-- Losses --")
    # "losses" = negative errors (district received less than entitled)
    print(
        "Avg. (per trial) # of districts losing $$:",
        (error < 0).groupby("trial").sum().mean()
    )
    print(
        f"Avg. total losses:",
        error[
            error < 0
        ].abs().groupby("trial").sum().mean()
    )
    print(
        f"Std. total losses:",
        error[
            error < 0
        ].abs().groupby("trial").sum().std()
    )
    print(
        "Total exp losses:",
        exp_error[exp_error < 0].abs().sum()
    )
    # per-district SDs combined in quadrature over losing districts
    print(
        "SD in total. exp. losses",
        np.sqrt(error.groupby(
            ["State FIPS Code", "District ID"]
        ).std()[exp_error < 0].pow(2).sum())
    )
    print(
        "Average exp loss",
        exp_error[exp_error < 0].abs().mean()
    )
    # 5th-percentile (near worst-case) error per district
    lowerr = err_grouped.quantile(0.05)
    print(
        "Total 5% quantile losses:",
        lowerr[lowerr < 0].abs().sum()
    )
    print(
        "Avg. 5% quantile loss:",
        lowerr[lowerr < 0].mean()
    )
    print("-- Misalloc --")
    # "misalloc" = absolute error regardless of sign
    print(
        f"Avg. total abs misalloc:",
        error.abs().groupby("trial").sum().mean()
    )
    print(
        f"Total exp abs misalloc:",
        exp_error.abs().sum()
    )
    if allocations is not None:
        print("-- Other stats --")
        # "small" district: true population under 20,000 (population is
        # constant across trials, so .first() suffices)
        small_district = allocations["true_pop_total"]\
            .groupby(["State FIPS Code", "District ID"])\
            .first() < 20000
        print(
            "# small districts:",
            small_district.sum()
        )
        print(
            "Total exp misalloc to large districts:",
            exp_error[~small_district].abs().sum()
        )
        print(
            "Total exp misalloc to small districts:",
            exp_error[small_district].abs().sum()
        )
        if grant_type is not None:
            print(
                "Total true alloc:",
                allocations[f"true_grant_{grant_type}"]
                .groupby(["State FIPS Code", "District ID"])
                .first().abs().sum()
            )
            print(
                "Total true alloc per child eligible",
                allocations[f"true_grant_{grant_type}"]
                .groupby(["State FIPS Code", "District ID"])
                .first().sum() / allocations[f"true_children_eligible"]
                .groupby(["State FIPS Code", "District ID"])
                .first().sum()
            )
            print("Average true alloc: {}".format(
                allocations[f"true_grant_{grant_type}"].mean()
            ))
            print(
                "Average true alloc per child eligible",
                (
                    allocations[f"true_grant_{grant_type}"] /
                    allocations["true_children_eligible"]
                ).mean()
            )
            print("Max true alloc: {}".format(
                allocations[f"true_grant_{grant_type}"].max()
            ))
Functions
def compare_treatments(treatments: Dict[str, pandas.core.frame.DataFrame], epsilon: float = 0.1, delta: float = 0.0, mapvar: str = 'error_per_child', experiment_name: str = None)
-
Compare treatment results.
Args
treatments
:Dict[str, pd.DataFrame]
- Treatment results mapped by treatment names.
epsilon
:float
, optional- Epsilon to use. Defaults to 0.1.
delta
:float
, optional- Delta to use. Defaults to 0.0.
mapvar
:str
, optional- Which variable to plot. Defaults to "error_per_child".
experiment_name
:str
, optional- Name of the experiment. Defaults to None.
Expand source code Browse git
def compare_treatments( treatments: Dict[str, pd.DataFrame], epsilon: float = 0.1, delta: float = 0.0, mapvar: str = "error_per_child", experiment_name: str = None ): """Compare treatment results. Args: treatments (Dict[str, pd.DataFrame]): Treatment results mapped by treatment names. epsilon (float, optional): Epsilon to use. Defaults to 0.1. delta (float, optional): Delta to use. Defaults to 0.0. mapvar (str, optional): Which variable to plot. Defaults to "error_per_child". experiment_name (str, optional): Name of the experiment. Defaults to None. """ if epsilon is None: print( "[WARN] Epsilon is none " "- only use this if there is only one eps value in the df." ) else: print("Comparing at eps=", epsilon) for treatment, df in treatments.items(): df = df.loc[pd.IndexSlice[ :, delta if delta is not None else slice(None), epsilon if epsilon is not None else slice(None), :, : ], :].copy() treatments[treatment] = df print(len(df)) print("\n#", treatment) print("True budget:", df[f"true_grant_total"].sum()) print("DP est budget:", df[f"dpest_grant_total"].sum()) df["became_ineligible"] = \ ( ~df.dpest_eligible_targeted.astype(bool) & df.true_eligible_targeted.astype(bool) )\ | ( ~df.dpest_eligible_basic.astype(bool) & df.true_eligible_basic.astype(bool) )\ | ( ~df.dpest_eligible_concentration.astype(bool) & df.true_eligible_concentration.astype(bool) ) print( "Avg prop. 
districts erroneously ineligible:", df.became_ineligible.groupby("trial").sum().mean() ) print("# est") misalloc_statistics( df.est_grant_total - df.true_grant_total ) print("# dpest") misalloc_statistics( df.dpest_grant_total - df.true_grant_total ) print("# marginal") misalloc_statistics( df.dpest_grant_total - df.est_grant_total ) # compare bias plot_treatments( treatments, lambda df: np.sqrt(df.groupby('trial')["misalloc"].mean()), sns.kdeplot, dict(bw_method=0.5, fill=True), epsilon=epsilon, delta=delta, xlab=f"Mean misalloc (per trial)", mean_line=True ) # compare RMSE - mean and hist plot_treatments( treatments, lambda df: np.sqrt(df.groupby('trial')["misalloc_sq"].mean()), sns.kdeplot, dict(bw_method=0.5, fill=True), filename=f"{experiment_name}_rmse", epsilon=epsilon, delta=delta, xlab=f"RMSE (per trial)", mean_line=True ) # compare likelihood of ineligibility plot_treatments( treatments, lambda df: df.groupby(['State FIPS Code', 'District ID'])["lost_eligibility"] .mean(), sns.kdeplot, dict(bw_method=0.5, fill=True), # filename=f"likelihood_ineligible_total", xlab=f"Likelihood of losing eligibility", epsilon=epsilon, delta=delta, mean_line=True ) # compare nationwide map print("Plotting", mapvar) ymins = [] ymaxs = [] treatments_geo = { treatment: geo_join( df.loc[pd.IndexSlice[ :, delta if delta is not None else slice(None), epsilon if epsilon is not None else slice(None), :, : ]] ) for treatment, df in treatments.items() } for treatment, df in treatments_geo.items(): err = cube(df.loc[[ f for f in df.index.get_level_values("State FIPS Code").unique() if f not in [2, 15] ]]["error_per_child"]) ymins.append(err.min()) ymaxs.append(err.max()) ymin = np.min(ymins) ymax = np.max(ymaxs) for treatment, df in treatments_geo.items(): heatmap( df.loc[[ f for f in df.index.get_level_values("State FIPS Code").unique() if f not in [2, 15] ]], y="error_per_child_eligible", label="Misallocation per eligible child (cube root)", title=treatment, 
file=f"{experiment_name}_{treatment}.pdf", figsize=(15, 10), bar_location='right', min=ymin, max=ymax, dpi=50 )
def cube(x)
-
Expand source code Browse git
def cube(x): return np.sign(x)*np.power(np.abs(x), 1/3)
def discrimination_join(results: pandas.core.frame.DataFrame, save_path: str = None, verbose: bool = False, include_moes: bool = False) ‑> pandas.core.frame.DataFrame
-
Join results to demographic covariates.
Args
results
:pd.DataFrame
- Results, keyed by LEA.
save_path
:str
, optional- Where to save the joined dataframe. Defaults to None.
verbose
:bool
, optional- Defaults to False.
include_moes
:bool
, optional- Whether to include moes with percents. Defaults to False.
Returns
pd.DataFrame
- Joined dataframe.
Expand source code Browse git
def discrimination_join( results: pd.DataFrame, save_path: str = None, verbose: bool = False, include_moes: bool = False ) -> pd.DataFrame: """Join results to demographic covariates. Args: results (pd.DataFrame): Results, keyed by LEA. save_path (str, optional): Where to save the joined dataframe. Defaults to None. verbose (bool, optional): Defaults to False. include_moes (bool, optional): Whether to include moes with percents. Defaults to False. Returns: pd.DataFrame: Joined dataframe. """ acs = get_acs_unified(verbose) variables = [ 'Total population (RACE) - est', ] # add race variables variables += [ r for r in acs.columns if ( r.endswith("(RACE) - pct") or (include_moes and r.endswith("(RACE) - pctmoe")) ) and "and" not in r and "races" not in r and not r.startswith("One race") ] + ["Two or more races (RACE) - pct"] if verbose and include_moes: print("Including MOEs") # add ethnicity variables hisp = [ r for r in acs.columns if r.endswith("(HISPANIC OR LATINO AND RACE) - pct") ] variables += hisp[1:6] # add income variables variables += [ r for r in acs.columns if r.startswith("Median household income (dollars) (") ] # add rural/urban - need a 3rd data source # add immigrant status variables += [ # "Foreign born (PLACE OF BIRTH) - est", "Foreign born (PLACE OF BIRTH) - pct", # "Not a U.S. citizen (U.S. CITIZENSHIP STATUS) - est", "Not a U.S. citizen (U.S. CITIZENSHIP STATUS) - pct" ] # add language isolation variables += [ # 'Language other than English (LANGUAGE SPOKEN AT HOME) - est', 'Language other than English (LANGUAGE SPOKEN AT HOME) - pct' ] # add renters vs. 
homeowners (housing security) variables += [ # 'Renter-occupied (HOUSING TENURE) - est', 'Renter-occupied (HOUSING TENURE) - pct', 'Average household size of renter-occupied unit (HOUSING TENURE) - est' ] # look in these columns for '-' and replace with nan according to ACS docs # https://www.census.gov/data/developers/data-sets/acs-1year/notes-on-acs-estimate-and-annotation-values.html # otherwise convert to numeric acs_vars = acs[variables]\ .replace(['-', "**", "***", "(X)", "N", "null"], np.nan)\ .replace('250,000+', 250000)\ .apply(pd.to_numeric, errors='raise') if verbose: print(variables) print(acs_vars.shape) print(results.shape) # adding geographic area geo = get_geography() to_join = acs_vars.join(geo["ALAND"], how="inner") if verbose: print("ACS", acs_vars.shape) print("Geo joined ACS", to_join.shape) print( to_join[ to_join["Total population (RACE) - est"].isna() ].groupby(["State FIPS Code", "District ID"]).groups.keys() ) print("Joining ACS/geo variables...") grants = results.join(to_join, how="inner") if verbose: print( "missing some districts in ACS/geo:", len(results.groupby([ "State FIPS Code", "District ID" ])) - len(grants.groupby([ "State FIPS Code", "District ID" ])) ) print(grants.shape) if save_path: print("Saving to feather...") grants.reset_index().to_feather(f"{save_path}.feather") print("... saved.") return grants
def discrimination_treatments_join(treatments_name: dict, exclude: list = [], epsilon: float = 0.1, delta: float = 0.0, **join_kwargs) ‑> pandas.core.frame.DataFrame
-
Join results to demographic covariates.
Args
treatments_name
:dict
- Treatment results keyed by treatment name.
exclude
:list
, optional- Treatments to exclude. Defaults to [].
epsilon
:float
, optional- Epsilon to use (one allowed). Defaults to 0.1.
delta
:float
, optional- Delta to use (one allowed). Defaults to 0.0.
Returns
pd.DataFrame
- Combined dataframe for all treatments.
Expand source code Browse git
def discrimination_treatments_join( treatments_name: dict, exclude: list = [], epsilon: float = 0.1, delta: float = 0.0, **join_kwargs ) -> pd.DataFrame: """Join results to demographic covariates. Args: treatments_name (dict): Treatment results keyed by treatment name. exclude (list, optional): Treatments to exclude. Defaults to []. epsilon (float, optional): Epsilon to use (one allowed). Defaults to 0.1. delta (float, optional): Delta to use (one allowed). Defaults to 0.0. Returns: pd.DataFrame: Combined dataframe for all treatments. """ # output a concatenated DF with a new index column indicating which # treatment was applied treatments = { treatment: df.loc[( slice(None), delta if delta is not None else slice(None), epsilon if epsilon is not None else slice(None), slice(None), slice(None) ), [ c for c in df.columns if c.endswith("- pct") or c.endswith("- est") or c.startswith("true") or c.endswith("grant_total") or c in [ "ALAND" ] ]] for treatment, df in load_treatments(treatments_name).items() if treatment not in exclude } print("Saving", treatments.keys()) joined = pd.concat( treatments, names=['treatment'] ) print("Concatenated has {} rows and {} treatments".format( len(joined), len(treatments) )) discrimination_joined = discrimination_join( joined, save_path=f"{config.root}/results/policy_experiments/" f"{treatments_name}_discrimination_laplace", **join_kwargs ) return discrimination_joined
def geo_join(results: pandas.core.frame.DataFrame) ‑> pandas.core.frame.DataFrame
-
Join results with shapefiles.
Args
results
:pd.DataFrame
- Results, keyed by LEA.
Returns
pd.DataFrame
- Results joined with LEA shapefile and geographic data.
Expand source code Browse git
def _add_error_columns(results: pd.DataFrame) -> None:
    """Add raw and per-child misallocation columns to `results`, in place."""
    results["error"] = results.est_grant_total - results.true_grant_total
    results["error_per_child"] = \
        results.error / results['true_children_total']
    results["error_per_child_eligible"] = \
        results.error / results['true_children_eligible']
    results["error_dp"] = \
        results.dpest_grant_total - results.true_grant_total
    results["error_dp_per_child"] = \
        results.error_dp / results['true_children_total']
    results["error_dp_per_child_eligible"] = \
        results.error_dp / results['true_children_eligible']
    results["percent_eligible"] = \
        results["true_children_eligible"] / results["true_children_total"]


def _add_eligibility_columns(
    results: pd.DataFrame, prefix: str, suffix: str
) -> None:
    """Add 0/1 eligibility-change indicators to `results`, in place.

    Compares `{prefix}_eligible_*` estimates against `true_eligible_*` for
    the three grant types, writing `switched_eligibility{suffix}`,
    `became_eligible{suffix}` and `became_ineligible{suffix}`.
    """
    grants = ["targeted", "basic", "concentration"]
    est = {g: results[f"{prefix}_eligible_{g}"] for g in grants}
    true = {g: results[f"true_eligible_{g}"] for g in grants}
    unchanged = (
        (est["targeted"] == true["targeted"])
        & (est["basic"] == true["basic"])
        & (est["concentration"] == true["concentration"])
    )
    results[f"switched_eligibility{suffix}"] = (~unchanged).astype(int)
    became_eligible = False
    became_ineligible = False
    for g in grants:
        e = est[g].astype(bool)
        t = true[g].astype(bool)
        became_eligible = became_eligible | (e & ~t)
        became_ineligible = became_ineligible | (~e & t)
    results[f"became_eligible{suffix}"] = became_eligible.astype(int)
    results[f"became_ineligible{suffix}"] = became_ineligible.astype(int)


def geo_join(results: pd.DataFrame) -> pd.DataFrame:
    """Join results with shapefiles.

    Args:
        results (pd.DataFrame): Results, keyed by LEA.

    Returns:
        pd.DataFrame: Results joined with LEA shapefile and geographic data.
    """
    # NOTE: only accepts "State FIPS Code", "District ID", "trial" in index
    results = results.copy()
    _add_error_columns(results)
    # Eligibility change flags for data-error estimates ("est") and for
    # DP-noised estimates ("dpest", columns suffixed "_dp").
    _add_eligibility_columns(results, "est", "")
    _add_eligibility_columns(results, "dpest", "_dp")
    # Marginal effect of DP noise on per-eligible-child error.
    results["dp_marginal"] = \
        results["error_dp_per_child_eligible"] -\
        results["error_per_child_eligible"]
    geo = get_geography()
    joined = geo.join(
        results[[
            "error",
            "error_per_child",
            "error_per_child_eligible",
            "error_dp",
            "error_dp_per_child",
            "error_dp_per_child_eligible",
            "true_children_eligible",
            "true_pop_total",
            "percent_eligible",
            "true_grant_total",
            "switched_eligibility",
            "became_eligible",
            "became_ineligible",
            "switched_eligibility_dp",
            "became_eligible_dp",
            "became_ineligible_dp",
            "dp_marginal"
        ]]
        .groupby(["State FIPS Code", "District ID"])
        .agg(['mean', 'std', 'sem', percentile(0.05)]),
        how="inner"
    )
    # Flatten the (column, statistic) MultiIndex; the 'mean' statistic keeps
    # the bare column name.
    joined.columns = [
        col if isinstance(col, str)
        else '_'.join([
            c for c in col if c != 'mean'
        ]).rstrip('_')
        for col in joined.columns.values
    ]
    # Division by zero children produces +/-inf; mask those as missing.
    for col in [
        "error_per_child",
        "error_per_child_eligible",
        "error_dp_per_child",
        "error_dp_per_child_eligible"
    ]:
        joined.loc[
            np.isinf(joined[col]),
            col
        ] = np.nan
    return joined
def get_geography() ‑> pandas.core.frame.DataFrame
-
Load shapefiles for LEAs.
Returns
pd.DataFrame
- Shapefiles indexed by school district.
Expand source code Browse git
def get_geography() -> pd.DataFrame:
    """Load shapefiles for LEAs.

    Returns:
        pd.DataFrame: Shapefiles indexed by school district.
    """
    shapefile_path = os.path.join(
        config.root,
        "data/shapefiles/school_districts_19/schooldistrict_sy1819_tl19.shp"
    )
    districts = gpd.read_file(shapefile_path)
    districts.STATEFP = districts.STATEFP.astype(int)
    # District ID preference order: unified, then secondary, then elementary.
    districts["District ID"] = np.where(
        districts.UNSDLEA.notna(),
        districts.UNSDLEA,
        np.where(
            districts.SCSDLEA.notna(),
            districts.SCSDLEA,
            districts.ELSDLEA
        )
    )
    districts["District ID"] = districts["District ID"].astype(int)
    return (
        districts
        .rename(columns={"STATEFP": "State FIPS Code"})
        .set_index(["State FIPS Code", "District ID"])
    )
def heatmap(data: pandas.core.frame.DataFrame, label: str = None, title: str = None, transform: str = 'cube', theme: str = 'seismic_r', y: str = 'error_dp_per_child', vcenter: int = 0, file: str = None, figsize: tuple = (10, 5), bar_location: str = 'bottom', min: int = None, max: int = None, dpi: int = 300, alpha: float = 0.1)
-
Plot heatmap of results.
Args
data
:pd.DataFrame
- Dataframe to plot from.
label
:str
, optional- Label for colorbar. Defaults to None.
title
:str
, optional- Title for plot. Defaults to None.
transform
:str
, optional- Transformation for data. Defaults to 'cube'.
theme
:str
, optional- Theme for colorbar. Defaults to "seismic_r".
y
:str
, optional- Column to plot. Defaults to "error_dp_per_child".
vcenter
:int
, optional- Center of colorbar. Defaults to 0.
file
:str
, optional- Filename. Defaults to None.
figsize
:tuple
, optional- Figure size. Defaults to (10, 5).
bar_location
:str
, optional- Where to place the colorbar. Defaults to 'bottom'.
min
:int
, optional- Minimum for colorbar. Defaults to None.
max
:int
, optional- Maximum for colorbar. Defaults to None.
dpi
:int
, optional- Figure DPI. Defaults to 300.
alpha
:float
, optional- Confidence level for t-test. Defaults to 0.1.
Expand source code Browse git
def heatmap(
    data: pd.DataFrame,
    label: str = None,
    title: str = None,
    transform: str = 'cube',
    theme: str = "seismic_r",
    y: str = "error_dp_per_child",
    vcenter: int = 0,
    file: str = None,
    figsize: tuple = (10, 5),
    bar_location: str = 'bottom',
    min: int = None,
    max: int = None,
    dpi: int = 300,
    alpha: float = 0.1
):
    """Plot heatmap of results.

    NOTE: mutates `data` in place (adds `{y}_moe`/`{y}_sig` columns and
    overwrites `y` with its transformed values).

    Args:
        data (pd.DataFrame): Dataframe to plot from. Assumed to support
            `.plot(column=...)` (i.e. a GeoDataFrame) — TODO confirm.
        label (str, optional): Label for colorbar. Defaults to None.
        title (str, optional): Title for plot. Defaults to None.
        transform (str, optional): Transformation for data. Defaults to
            'cube'.
        theme (str, optional): Theme for colorbar. Defaults to "seismic_r".
        y (str, optional): Column to plot. Defaults to "error_dp_per_child".
        vcenter (int, optional): Center of colorbar. Defaults to 0.
        file (str, optional): Filename. Defaults to None.
        figsize (tuple, optional): Figure size. Defaults to (10, 5).
        bar_location (str, optional): Where to place the colorbar.
            Defaults to 'bottom'.
        min (int, optional): Minimum for colorbar. Defaults to None.
        max (int, optional): Maximum for colorbar. Defaults to None.
        dpi (int, optional): Figure DPI. Defaults to 300.
        alpha (float, optional): Confidence level for t-test.
            Defaults to 0.1.
    """
    if alpha is not None:
        # Normal-approximation margin of error from the per-district SEM.
        data[f"{y}_moe"] = data.loc[:, f"{y}_sem"] * \
            stats.norm.ppf(1 - alpha / 2)
        # Significant iff the (1-alpha) CI excludes zero.
        sig = ~(
            ((data[y] + data[f"{y}_moe"]) >= 0)
            & ((data[y] - data[f"{y}_moe"]) <= 0)
        )
        print(
            "All but", len(data)-sig.sum(),
            f"are significantly different from zero at {alpha}"
        )
    fig, ax = plt.subplots(1, figsize=figsize, dpi=dpi)
    # Transform both the value and its MOE so they stay on the same scale.
    for key in [y, f"{y}_moe"] if alpha is not None else [y]:
        if transform == 'cube':
            data.loc[:, key] = cube(data[key])
        if transform == 'log':
            data.loc[:, key] = np.where(data[key] == 0, 0, np.log(data[key]))
        if transform == 'sqrt':
            # Signed square root keeps negative errors negative.
            data.loc[:, key] = np.sign(data[key]) * np.sqrt(np.abs(data[key]))
    # Create colorbar as a legend
    if min is None:
        min = data[y].min()
    if max is None:
        max = data[y].max()
    bound = np.max(np.abs([min, max]))
    if vcenter is not None and transform != 'log':
        # FIX: previously hardcoded vcenter=0, ignoring the `vcenter`
        # parameter (default 0, so default behavior is unchanged).
        norm = pltc.TwoSlopeNorm(vcenter=vcenter, vmin=-bound, vmax=bound)
    else:
        norm = pltc.Normalize(vmin=min, vmax=max)
    sm = plt.cm.ScalarMappable(cmap=theme, norm=norm)
    if alpha is not None:
        print(
            f"None of the {(1-alpha)*100}% MOEs exceeds",
            data[f"{y}_moe"].abs().max()
        )
        # Non-significant districts become NaN so they render hatched.
        data[f"{y}_sig"] = np.where(sig, data[y], np.nan)
    data.plot(
        column=f"{y}_sig" if alpha is not None else y,
        cmap=theme,
        norm=norm,
        ax=ax,
        linewidth=0.05,
        edgecolor='0.1',
        missing_kwds=dict(
            hatch='///',
            edgecolor=(0, 0, 0, 0.25),
            facecolor='none',
            label=f"Not significant (p < {alpha})"
        ),
        rasterized=True
    )
    cb = fig.colorbar(
        sm,
        location=bar_location,
        shrink=0.5, pad=0.05, aspect=30
    )
    cb.set_label(label)
    if title is not None:
        plt.title(title)
    plt.axis('off')
    plt.tight_layout()
    if file is not None:
        plt.savefig(
            f"{config.root}/plots/geo/{file}",
            bbox_inches='tight',
            transparent=True,
            dpi=dpi
        )
        plt.close()
    else:
        plt.show()
def load_treatments(experiment_name: str, treatment_name: str = None) ‑> Union[Dict[str, pandas.core.frame.DataFrame], pandas.core.frame.DataFrame]
-
Load treatment results from memory.
Args
experiment_name
:str
- Experiment to load.
treatment_name
:str
, optional- Name of treatment to load. If not specified, loads all treatments. Defaults to None.
Returns
Dict[str, pd.DataFrame]
- Dictionary of treatments mapped to results,
or individual set of results if
treatment_name
is specified.
Expand source code Browse git
def load_treatments(
    experiment_name: str,
    treatment_name: str = None
) -> Union[Dict[str, pd.DataFrame], pd.DataFrame]:
    """Load treatment results from memory.

    Args:
        experiment_name (str): Experiment to load.
        treatment_name (str, optional): Name of treatment to load. If not
            specified, loads all treatments. Defaults to None.

    Returns:
        Dict[str, pd.DataFrame]: Dictionary of treatments mapped to results,
            or individual set of results if `treatment_name` is specified.
    """
    # Use a context manager so the pickle file handle is closed promptly
    # (the original left it open until garbage collection).
    with open(
        f"{config.root}/results/policy_experiments/{experiment_name}.pkl",
        'rb'
    ) as f:
        treatments = pickle.load(f)
    if treatment_name is None:
        return treatments
    return treatments[treatment_name]
def match_true(df_true: pandas.core.frame.DataFrame, dfs_to_match: pandas.core.frame.DataFrame)
-
Equalize result baselines (columns with "true").
Args
df_true
:pd.DataFrame
- Dataframe with ground truth.
dfs_to_match
:pd.DataFrame
- Dataframes to equalize to
df_true
.
Expand source code Browse git
def match_true(
    df_true: pd.DataFrame,
    dfs_to_match: pd.DataFrame
):
    """Equalize result baselines (columns with "true").

    Overwrites every column whose name contains "true" in each frame of
    `dfs_to_match` with the corresponding column of `df_true`, in place.

    Args:
        df_true (pd.DataFrame): Dataframe with ground truth.
        dfs_to_match (pd.DataFrame): Dataframes to equalize to `df_true`.
    """
    true_columns = [col for col in df_true.columns if "true" in col]
    for frame in dfs_to_match:
        for col in true_columns:
            frame.loc[:, col] = df_true.loc[:, col]
def misalloc_statistics(error: pandas.core.series.Series, allocations: pandas.core.frame.DataFrame = None, grant_type: str = None)
-
Print statistics describing misallocation in a set of simulations.
Args
error
:pd.Series
- Error in allocation indexed by district.
allocations
:pd.DataFrame
, optional- Full set of allocations. Defaults to None.
grant_type
:str
, optional- Optionally, a grant type to print specific statistics for. Defaults to None.
Expand source code Browse git
def misalloc_statistics(
    error: pd.Series,
    allocations: pd.DataFrame = None,
    grant_type: str = None
):
    """Print statistics describing misallocation in a set of simulations.

    Expects `error` (and `allocations`, if given) to be indexed by
    "State FIPS Code", "District ID" and "trial".

    Args:
        error (pd.Series): Error in allocation indexed by district.
        allocations (pd.DataFrame, optional): Full set of allocations.
            Defaults to None.
        grant_type (str, optional): Optionally, a grant type to print
            specific statistics for (only used when `allocations` is
            provided). Defaults to None.
    """
    # Expected (across trials) error per district.
    err_grouped = error.groupby(
        ["State FIPS Code", "District ID"]
    )
    exp_error = err_grouped.mean()
    print(f"# rows: {len(error)}")
    print(f"Max error: {np.abs(error).max()}")
    print("-- RMSE --")
    # NOTE: spurious f-prefixes on placeholder-free strings removed
    # throughout; printed output is unchanged.
    print("RMSE:", np.sqrt(np.mean(error**2)))
    print(
        "Avg. RMSE",
        np.mean(
            np.sqrt((error**2).groupby('trial').mean())
        )
    )
    print(
        "RMSE in exp. error:",
        np.sqrt(np.mean(exp_error**2))
    )
    print("-- Losses --")
    print(
        "Avg. (per trial) # of districts losing $$:",
        (error < 0).groupby("trial").sum().mean()
    )
    print(
        "Avg. total losses:",
        error[
            error < 0
        ].abs().groupby("trial").sum().mean()
    )
    print(
        "Std. total losses:",
        error[
            error < 0
        ].abs().groupby("trial").sum().std()
    )
    print(
        "Total exp losses:",
        exp_error[exp_error < 0].abs().sum()
    )
    print(
        "SD in total. exp. losses",
        np.sqrt(error.groupby(
            ["State FIPS Code", "District ID"]
        ).std()[exp_error < 0].pow(2).sum())
    )
    print(
        "Average exp loss",
        exp_error[exp_error < 0].abs().mean()
    )
    # 5th-percentile (near-worst-case) error per district.
    lowerr = err_grouped.quantile(0.05)
    print(
        "Total 5% quantile losses:",
        lowerr[lowerr < 0].abs().sum()
    )
    print(
        "Avg. 5% quantile loss:",
        lowerr[lowerr < 0].mean()
    )
    print("-- Misalloc --")
    print(
        "Avg. total abs misalloc:",
        error.abs().groupby("trial").sum().mean()
    )
    print(
        "Total exp abs misalloc:",
        exp_error.abs().sum()
    )
    if allocations is not None:
        print("-- Other stats --")
        # "Small" = true population under 20k (first trial's value).
        small_district = allocations["true_pop_total"]\
            .groupby(["State FIPS Code", "District ID"])\
            .first() < 20000
        print(
            "# small districts:",
            small_district.sum()
        )
        print(
            "Total exp misalloc to large districts:",
            exp_error[~small_district].abs().sum()
        )
        print(
            "Total exp misalloc to small districts:",
            exp_error[small_district].abs().sum()
        )
        # NOTE(review): grant_type stats use `allocations`, so this branch
        # is nested under `allocations is not None` — confirm against the
        # original source layout.
        if grant_type is not None:
            print(
                "Total true alloc:",
                allocations[f"true_grant_{grant_type}"]
                .groupby(["State FIPS Code", "District ID"])
                .first().abs().sum()
            )
            print(
                "Total true alloc per child eligible",
                allocations[f"true_grant_{grant_type}"]
                .groupby(["State FIPS Code", "District ID"])
                .first().sum() /
                allocations["true_children_eligible"]
                .groupby(["State FIPS Code", "District ID"])
                .first().sum()
            )
            print("Average true alloc: {}".format(
                allocations[f"true_grant_{grant_type}"].mean()
            ))
            print(
                "Average true alloc per child eligible",
                (
                    allocations[f"true_grant_{grant_type}"]
                    / allocations["true_children_eligible"]
                ).mean()
            )
            print("Max true alloc: {}".format(
                allocations[f"true_grant_{grant_type}"].max()
            ))
def percentile(n)
-
Expand source code Browse git
def percentile(n):
    """Build a quantile aggregator usable with pandas `.agg`.

    Args:
        n: Quantile in [0, 1].

    Returns:
        Callable: Maps a Series to its `n`-th quantile; its `__name__` is
            set (e.g. ``percentile_5.0``) so pandas labels the output column.
    """
    def quantile_agg(x):
        return x.quantile(n)
    quantile_agg.__name__ = 'percentile_{}'.format(n*100)
    return quantile_agg
def plot_treatments(treatments: Dict[str, pandas.core.frame.DataFrame], x_func: Callable, plot_method: Callable, plot_kwargs: dict, filename: str = None, xlab: str = None, ylab: str = 'Smoothed density', grant: str = 'total', epsilon: float = None, delta: float = None, mean_line: bool = False)
-
Plot treatment results.
Args
treatments
:Dict[str, pd.DataFrame]
- Treatment results mapped by treatment name.
x_func
:Callable
- Function to transform dataframe before plotting.
plot_method
:Callable
- Function for plotting.
plot_kwargs
:dict
- Parameters for plotting function.
filename
:str
, optional- Filename of plot. Defaults to None.
xlab
:str
, optional- X label for plot. Defaults to None.
ylab
:str
, optional- Y label for plot. Defaults to "Smoothed density".
grant
:str
, optional- Grant type to plot. Defaults to "total".
epsilon
:float
, optional- Epsilon to plot. Defaults to None.
delta
:float
, optional- Delta to plot. Defaults to None.
mean_line
:bool
, optional- Whether to include mean line. Defaults to False.
Expand source code Browse git
def plot_treatments(
    treatments: Dict[str, pd.DataFrame],
    x_func: Callable,
    plot_method: Callable,
    plot_kwargs: dict,
    filename: str = None,
    xlab: str = None,
    ylab: str = "Smoothed density",
    grant: str = "total",
    epsilon: float = None,
    delta: float = None,
    mean_line: bool = False
):
    """Plot treatment results.

    Args:
        treatments (Dict[str, pd.DataFrame]): Treatment results mapped by
            treatment name.
        x_func (Callable): Function to transform dataframe before plotting.
        plot_method (Callable): Function for plotting.
        plot_kwargs (dict): Parameters for plotting function.
        filename (str, optional): Filename of plot. Defaults to None.
        xlab (str, optional): X label for plot. Defaults to None.
        ylab (str, optional): Y label for plot. Defaults to
            "Smoothed density".
        grant (str, optional): Grant type to plot. Defaults to "total".
        epsilon (float, optional): Epsilon to plot. Defaults to None.
        delta (float, optional): Delta to plot. Defaults to None.
        mean_line (bool, optional): Whether to include mean line.
            Defaults to False.
    """
    palette = sns.color_palette(n_colors=len(treatments))
    for i, (treatment, df_raw) in enumerate(treatments.items()):
        # None for delta/epsilon selects all values at that index level.
        df = df_raw.loc[pd.IndexSlice[
            :,
            delta if delta is not None else slice(None),
            epsilon if epsilon is not None else slice(None),
            :,
            :
        ], :].copy()
        df.loc[:, "misalloc"] = \
            df[f"dpest_grant_{grant}"] - df[f"true_grant_{grant}"]
        df.loc[:, "misalloc_sq"] = np.power(df["misalloc"], 2)
        if grant == "total":
            # NOTE(review): this branch flags dpest-eligible & not
            # true-eligible (i.e. *gained* eligibility), the opposite
            # direction of the else branch — confirm intended.
            df["lost_eligibility"] = \
                (
                    df["dpest_eligible_basic"].astype(bool)
                    & ~df["true_eligible_basic"].astype(bool)
                ) |\
                (
                    df["dpest_eligible_concentration"].astype(bool)
                    & ~df["true_eligible_concentration"].astype(bool)
                ) |\
                (
                    df["dpest_eligible_targeted"].astype(bool)
                    & ~df["true_eligible_targeted"].astype(bool)
                )
        else:
            df["lost_eligibility"] = \
                ~df["dpest_eligible_{}".format(grant)].astype(bool) \
                & df["true_eligible_{}".format(grant)].astype(bool)
        plot_kwargs.update({
            'label': treatment,
            'color': palette[i]
        })
        x = x_func(df)
        plot_method(x, **plot_kwargs)
        if mean_line:
            plt.axvline(
                x.mean(),
                color=palette[i],
                linestyle='dashed'
            )
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.legend(loc='upper right')
    if filename:
        # FIX: the save path previously did not interpolate `filename`,
        # so every plot overwrote the same file.
        plt.savefig(
            f"{config.root}/plots/bootstrap/{filename}.pdf",
            transparent=True,
            bbox_inches='tight'
        )
    plt.show()
    plt.close()
def save_treatments(treatments: Dict[str, pandas.core.frame.DataFrame], experiment_name: str)
-
Save treatment results.
Args
treatments
:Dict[str, pd.DataFrame]
- Dictionary of treatment names mapped to results.
experiment_name
:str
- Name for this collection of treatments.
Expand source code Browse git
def save_treatments(
    treatments: Dict[str, pd.DataFrame],
    experiment_name: str
):
    """Save treatment results.

    Args:
        treatments (Dict[str, pd.DataFrame]): Dictionary of treatment names
            mapped to results.
        experiment_name (str): Name for this collection of treatments.
    """
    # minify treatments: keep only the columns downstream analysis uses.
    treatments = {
        treatment: df.loc[:, [
            c for c in df.columns
            if c.endswith("- pct")
            or c.endswith("- est")
            or c.startswith("true")
            or "eligible" in c
            or "children_total" in c
            or "grant" in c
            or c in [
                "ALAND"
            ]
        ]]
        for treatment, df in treatments.items()
    }
    # Use a context manager so the file handle is closed (and the write
    # flushed) deterministically; the original left it open.
    with open(
        f"{config.root}/results/policy_experiments/{experiment_name}.pkl",
        'wb'
    ) as f:
        pickle.dump(treatments, f)