<- ca_df %>%
ca_dft st_drop_geometry() %>%
as_tibble()
<- df %>%
model_df # Modify education
mutate(education_num = as.numeric(education_num)) %>%
mutate(education = case_when(
< 3 ~ "lower",
education_num > 4 ~ "higher",
education_num TRUE ~ "middle"
),
# Account for changes in wave7 encoding
education = case_when(
== "wave_7" & education_num == 2 ~ "middle",
wave == "wave_7" & education_num == 3 ~ "higher",
wave TRUE ~ education
),
education = factor(education, levels = c("higher", "middle", "lower"))) %>%
# Modify income
drop_na() %>%
mutate(income_num = as.numeric(income_num)) %>%
mutate(income = case_when(
< 4 ~ "low",
income_num > 7 ~ "high",
income_num TRUE ~ "middle"
),
# Account for changes in wave7 encoding
income = case_when(
== "wave_7" & income_num == 1 ~ "low",
wave == "wave_7" & income_num == 2 ~ "middle",
wave == "wave_7" & income_num == 3 ~ "high",
wave TRUE ~ income
),
income = factor(income, levels = c("high", "middle", "low"))) %>%
# Modify age
mutate(age_num = case_when(
== "1" ~ "16-29",
age_num == "2" ~ "30-39",
age_num == "3" ~ "50 and above"
age_num %>%
))
left_join(feature_summarize(ca_dft, "ren")) %>%
left_join(feature_summarize(ca_dft, "dis_F")) %>%
left_join(feature_summarize(ca_dft, "temp")) %>%
left_join(feature_summarize(ca_dft, "co2_2")) %>%
left_join(feature_summarize(ca_dft, "gdp_2")) %>%
left_join(ca_dft %>% select(iso_a2, lifeExp), by = "iso_a2") %>%
drop_na() %>%
filter(env_opinion != "3")
Modelling
We have used exploratory data analysis to better understand what data we have and what can be done with it. In this section we will focus on building a model to predict the support of environmental protection over economic growth.
We are doing this with support of tidymodels and XGBoost!
XGBOOST
There are several steps to create a useful model, including parameter estimation, model selection and tuning, and performance assessment.
First, is selecting the most appropriate variables for prediction.
In this section, we select the most appropriate variables based on previous EDA,
Fit xg_boost model
For the first part, data is split into two distinct sets, the training set and the test set. The training set (typically larger) is used to develop and optimize the model by fitting different models and investigating various feature engineering strategies etc.
The other portion of the data is the test set. This is held in reserve until one or two models are chosen as the methods that are most likely to succeed.
# Load tidymodels and xgboost
library(tidymodels)
library(xgboost)
set.seed(2056)
# Split data
<- model_df %>%
df_split mutate(env_opinion = factor(env_opinion, levels = c("1", "2"))) %>%
initial_split(prop = 0.75)
# Extract train and test sets
= training(df_split)
train = testing(df_split)
test ::glue(
glue'The training set has {nrow(train)} observations \n',
'The testing set has {nrow(test)} observations'
)# Create resamples for model assessment
= vfold_cv(train, v = 3) train_folds
Create preprocessor
Feature engineering entails reformatting predictor values to make them easier for a model to use effectively.
# Function for prepping and baking a recipe
<- function(recipe){
prep_juice %>%
recipe prep() %>%
juice()
}<- recipe(
boost_recipe ~ age_num + education + income +
env_opinion + dis_F + temp +co2_2 +gdp_2+ lifeExp, data = train) %>%
ren # Pool infrequently occurring values into an "other" category.
step_other(age_num, threshold = 0.05) %>%
step_other(contains("age_num"), threshold = 0.05) %>%
# Encode dummy variables
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
# Near zero variance filter
step_nzv(all_predictors())
# Just for sanity check
#View(prep_juice(boost_recipe))
# Create boosted tree model specification
<- boost_tree(
boost_spec #mtry = tune(),
trees = 50,
#min_n = tune(),
#tree_depth = tune(),
learn_rate = 0.01,
#loss_reduction = tune(),
#sample_size = tune(),
#stop_iter = tune()
%>%
) set_engine("xgboost") %>%
set_mode("classification")
# Bind recipe and model specification together
<- workflow() %>%
boost_wf add_recipe(boost_recipe) %>%
add_model(boost_spec)
# Print workflow
boost_wf
Model training
Boosted tree models typically have tuning parameters or hyperparameters must be specified ahead of time and can't be directly found from training data. These are unknown structural or other kind of values that have significant impact on the model but cannot be directly estimated from the data. Instead, hyperparameters are estimated using simulated data sets created from a process called resampling such as cross-validation or bootstrap resampling.
::registerDoParallel()
doParallelset.seed(2056)
#library(finetune)
# Evaluation metrics during tuning
<- metric_set(mn_log_loss, accuracy)
eval_metrics # xgb_race <- tune_grid(boost_wf, resamples = train_folds, grid = 7, metrics = eval_metrics)
# # Efficient grid search via racing
<- tune_race_anova(
xgb_race object = boost_wf,
resamples = train_folds,
metrics = eval_metrics,
# Try out 20 different hyperparameter combinations
grid = 20,
control = control_race(
verbose_elim = TRUE
) )
# Tibble with model with best accuracy
%>%
xgb_race show_best(metric = "accuracy")
# Train model
# Train then test
# Finalize workflow
<- boost_wf %>%
final_boost_wf finalize_workflow(select_best(xgb_race, metric = "mn_log_loss" #mn_log_loss
))# Train then test
<- final_boost_wf %>%
xgb_model last_fit(df_split, metrics = metric_set(accuracy, recall, spec, ppv, roc_auc, mn_log_loss, f_meas))
# Collect metrics
%>%
xgb_model collect_metrics()
Evaluate model performance
# Plot confusion matrix
%>%
xgb_model collect_predictions() %>%
conf_mat(truth = env_opinion, estimate = .pred_class) %>%
autoplot(type = "heatmap")
# Prettier?
update_geom_defaults(geom = "rect", new = list(fill = "midnightblue", alpha = 0.7))
%>%
xgb_model collect_predictions() %>%
conf_mat(env_opinion, .pred_class) %>%
autoplot()
The performance metrics considered are:=
Recall: TP/(TP + FN)
defined as the proportion of positive results out of the number of samples which were actually positive. Also known as sensitivity
.
Specificity: TN/(TN + FP)
defined as the proportion of negative results out of the number of samples which were actually negative.
Precision: TP/(TP + FP)
defined as the proportion of predicted positives that are actually positive. Also called positive predictive value
Accuracy: TP + TN/(TP + TN + FP + FN)
The percentage of labels predicted accurately for a sample.
# Collect metrics
%>%
xgb_model collect_metrics() %>%
filter(.metric %in% c("accuracy", "recall", "f_meas", "mn_log_loss"))
Insights from modeling results:
The model was able to correctly distinguish 60% of the respondents
The model has a high ability to accurately distinguish respondents who support climate action.
F Measure: A weighted average of the precision and recall, with best being 1 and worst being 0.
Overall, for only tuning the number of trees, the model performed quite decently.
Model interpretability
In this section, we explore why the model makes the predictions it does.
options(scipen = 999)
# Extract trained workflow
<- xgb_model %>%
xgb_wf extract_workflow()
# Extract variable importance
library(vip)
<- xgb_wf %>%
vi extract_fit_parsnip() %>%
vi()
vi
Let’s visualize these model interpretability results
%>%
vi slice_max(Importance, n = 42) %>%
mutate(Variable = fct_reorder(Variable, Importance)) %>%
ggplot(mapping = aes(y = Variable, x = Importance)) +
geom_point(size = 3, color = "dodgerblue") +
geom_segment(aes(y = Variable, yend = Variable, x = 0, xend = Importance), size = 2, color = "dodgerblue", alpha = 0.7 ) +
ggtitle(paste("Variable Importance plot of top", round(nrow(vi)/2), "variables")) +
theme(plot.title = element_text(hjust = 0.5))
# SHAP for xgboost
library(SHAPforxgboost)
# Prepare shap values for plotting. Requires a matrix
<- shap.prep(
opinion_shap # Actual Boost engine
xgb_model = xgb_wf %>%
extract_fit_engine(),
# predictors used to calculate SHAP values
X_train = boost_recipe %>%
prep() %>% bake(has_role("predictor"),
new_data = NULL,
composition = "matrix"),
top_n = 8
)shap.plot.summary(opinion_shap)
Final comments
From the VIP plot, we can observe which variables most/least influence the model performance.
The life expectancy of the country where an individual lives in has the most predictive value for this model. This alludes to if the living conditions that lead to higher life expectancies such as health, diet/nutrition, respondents are more likely to support climate action.
The proportion of renewable energy consumed by respondents was the second most predictive feature. Perhaps if people experience the viability of renewable energy sources and the positive effects on their cities etc, they are more likely to support climate action.
Other factors directly related to climate change are also considered important by the model in predicting opinion. As people observe effects of climate change, this would influence their thoughts.
Higher education of literacy and certain age groups are important for the model.
Next steps
Consider SHAP summary plot that combines feature importance and feature effects with features being ordered according to their importance.
Explore Local model explanations which provide information about a prediction for a single observation and how changing a feature can simulate change in opinion.