--- title: "mlspatial:Spatial Machine Learning workflow" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{mlspatial} %\VignetteEngine{knitr::rmarkdown} editor_options: chunk_output_type: console --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = FALSE, message = FALSE ) ``` ```{r} olddir <- getwd() setwd(tempdir()) setwd(olddir) ``` ```{r} oldopt <- options(digits = 4) options(oldopt) ``` # Introduction The `mlspatial` package provides an integrated framework for spatial epidemiological analysis by combining geospatial data processing, machine learning, and spatial statistical methods within a unified workflow. It is designed to support the analysis and prediction of disease patterns across geographic regions using both traditional spatial techniques and modern predictive modelling approaches. The package enables users to build end-to-end spatial modelling pipelines, including data preprocessing, thematic mapping, and the application of machine learning algorithms such as Random Forest, XGBoost, and Support Vector Regression. In addition, it incorporates spatial autocorrelation diagnostics, including Moran’s I and Local Indicators of Spatial Association (LISA), to assess geographic clustering and dependencies in health outcomes. This vignette demonstrates the core functionality of `mlspatial` using a synthetic spatial epidemiology dataset, illustrating how machine learning models can be applied to predict disease risk and how spatial patterns can be visualised and interpreted. # Environment Setup We begin by loading the required libraries and accessing example spatial data bundled with the package. A temporary working directory is also defined to store model outputs, visualisations, and intermediate results generated during the analysis workflow. ```{r setup} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) library(mlspatial) library(dplyr) library(ggplot2) library(tmap) library(sf) library(spdep) library(randomForest) library(xgboost) library(e1071) library(caret) library(tidyr) ``` # Data Loading and Spatial Integration Spatial epidemiological analyses require the integration of geographic boundaries with disease and demographic data. The mlspatial package provides a streamlined workflow for loading, visualising, and merging spatial datasets: ## Loading Spatial Data: Spatial objects (e.g., shapefiles) can be imported and quickly visualised using built-in plotting functions or compatible mapping libraries. ## Exploratory Mapping: Users can generate thematic maps with country or region labels to inspect geographic structure and verify data integrity. ## Data Integration: Epidemiological datasets (e.g., incidence, prevalence, mortality) are merged with spatial objects using join_data() or standard dplyr joins, enabling downstream modelling and mapping. ```{r} # Quick thematic map with country labels plot(africa_shp) qtm(africa_shp) qtm(africa_shp, text = "NAME") # Replace with actual field name tm_shape(africa_shp) + tm_polygons() + tm_text("NAME", size = 0.5) + # Replace with correct column tm_title ("Africa Countries") ggplot(africa_shp) + geom_sf(fill = "lightgray", color = "black") + geom_sf_text(aes(label = NAME), size = 2) + # Replace as needed ggtitle("Africa countries") + theme_minimal() ``` ```{r} # Join data mapdata <- join_data(africa_shp, panc_incidence, by = "NAME") ## OR Joining/ merging my data and shapefiles mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME") ## OR mapdata <- left_join(nat, codata, by = "DISTRICT_N") str(mapdata) ``` # Thematic Mapping of Disease Incidence Visualising disease incidence across geographic regions is a key step in spatial epidemiology. The `mlspatial` package supports the creation of high-quality thematic maps that highlight spatial variation across demographic groups. In this example, we generate a series of quantile-based choropleth maps to visualise pancreatic cancer incidence across countries, stratified by sex and age groups. # Quantile Mapping: Values are classified into quantiles to ensure balanced visual representation across regions. # Multiple Indicators: Maps are generated for overall incidence, sex-specific incidence, and age-stratified subgroups. #Layout Arrangement: Individual maps are combined into a grid layout for comparative visual analysis. ## Interpretation These maps provide a comparative view of pancreatic cancer incidence across countries and demographic subgroups. Quantile classification ensures that spatial differences are visually balanced, making it easier to identify high-risk and low-risk regions. Stratified maps further reveal how disease burden varies by age and sex, supporting more targeted epidemiological insights and public health interventions. ```{r} #Visualize Pancreatic cancer Incidence by countries #Basic map with labels # quantile map p1 <- tm_shape(mapdata) + tm_fill("incidence", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Incidence")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p2 <- tm_shape(mapdata) + tm_fill("female", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Female")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p3 <- tm_shape(mapdata) + tm_fill("male", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Male")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p4 <- tm_shape(mapdata) + tm_fill("ageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Age:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p5 <- tm_shape(mapdata) + tm_fill("agec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Age:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p6 <- tm_shape(mapdata) + tm_fill("agea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Age:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p7 <- tm_shape(mapdata) + tm_fill("fageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Female:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p8 <- tm_shape(mapdata) + tm_fill("fagec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Female:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p9 <- tm_shape(mapdata) + tm_fill("fagea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Female:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p10 <- tm_shape(mapdata) + tm_fill("mageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Male:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p11 <- tm_shape(mapdata) + tm_fill("magec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Male:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) p12 <- tm_shape(mapdata) + tm_fill("magea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Male:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) current.mode <- tmap_mode("plot") tmap_arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, widths = c(.75, .75)) tmap_mode(current.mode) ``` # Machine Learning Modelling The mlspatial package integrates multiple machine learning algorithms to model spatial variation in epidemiological outcomes. Users can train predictive models using structured demographic, temporal, and geographic features. ## Model Training: Functions such as train_rf(), train_xgb(), and train_svr() allow users to fit Random Forest, XGBoost, and Support Vector Regression models, respectively. ## Prediction: Trained models can be used to generate predicted disease outcomes across geographic regions. ## Model Comparison: Multiple models can be evaluated to identify the best-performing approach for a given dataset. ```{r} ## Machine Learning Model building # 1. Random Forest Regression # Train Random Forest set.seed(123) rf_model <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500, importance = TRUE) # View model results print(rf_model) varImpPlot(rf_model) #Plot Variable Importance library(ggplot2) importance_df <- data.frame( Variable = rownames(importance(rf_model)), Importance = importance(rf_model)[, "IncNodePurity"]) ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + labs(title = "Variable Importance (IncNodePurity)", x = "Variable", y = "Importance") # 2. Gradient Boosting Machine (XGBoost) # Prepare the data x_vars <- model.matrix(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata)[,-1] y <- mapdata$incidence # Convert to DMatrix dtrain <- xgb.DMatrix(data = x_vars, label = y) # Train model # Train model using xgb.train() params <- list( objective = "reg:squarederror", max_depth = 4, learning_rate = 0.1, verbosity = 0 ) xgb_model <- xgb.train( params = params, data = dtrain, nrounds = 100 ) # Feature importance xgb.importance(model = xgb_model) # 3. Support Vector Regression (SVR) # Train SVR model svr_model <- svm(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, type = "eps-regression", kernel = "radial") # Summary and predictions summary(svr_model) #mapdata$pred_svr <- predict(svr_model) ``` ## Model Evaluation To assess predictive performance, mlspatial provides functions to compute standard regression metrics and compare models. # Performance Metrics: Evaluate models using RMSE, MAE, and R². # Cross-Validation: Built-in support for k-fold cross-validation improves robustness and prevents overfitting. ```{r} # Model Evaluation (predictions): # evaluate each model step-by-step: # 1. Random Forest Evaluation rf_preds <- predict(rf_model, newdata = mapdata) rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence) print(rf_metrics) # 2. XGBoost Evaluation xgb_preds <- predict(xgb_model, newdata = x_vars) xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence) print(xgb_metrics) # 3. SVR Evaluation svr_preds <- predict(svr_model, newdata = mapdata) svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence) print(svr_metrics) # Compare All Models model_eval <- data.frame( Model = c("Random Forest", "XGBoost", "SVR"), RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]), MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]), Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"])) print(model_eval) #Choosing the Best Model #Lowest RMSE and MAE = most accurate predictions #Highest R² = best variance explanation #Sort and interpret: model_eval[order(model_eval$RMSE), ] # Best = Top row ``` # Model Prediction Visualisation Visualising model predictions is essential for understanding model behaviour, comparing performance across algorithms, and assessing predictive accuracy. The `mlspatial` package supports multiple approaches for exploring predicted outputs. # Prediction Trends Across Observations To compare how different models behave across observations, predictions from multiple models can be combined and visualised as line plots. ## Reshaping Predictions: Predictions from each model are combined into a long format for flexible plotting. ## Trend Visualisation: Line plots allow comparison of prediction patterns across models. ## Observed vs Predicted Comparison Comparing observed and predicted values provides a direct assessment of model accuracy. Ideally, predictions should align closely with the 45-degree reference line. ## Observed vs Predicted with Correlation To enhance interpretability, scatter plots can be combined with correlation statistics (e.g., Pearson R²), providing a quantitative measure of model performance. ## Interpretation These visualisations provide complementary insights into model performance. Line plots reveal differences in prediction behaviour across observations, while observed vs predicted plots assess accuracy and bias. The inclusion of correlation statistics further quantifies model performance, helping identify the most reliable algorithm for spatial epidemiological prediction. ```{r} #### Models Predicted plots # Create a data frame from your table model_preds <- data.frame(rf_preds, xgb_preds, svr_preds) # Add observation ID model_preds$ID <- 1:nrow(model_preds) # Reshape for plotting model_long <- model_preds %>% tidyr::pivot_longer(cols = c("rf_preds", "xgb_preds", "svr_preds"), names_to = "Model", values_to = "Predicted") # Plot ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) + geom_line(linewidth = 0.5) + labs(title = "Model Predictions Over Observations", x = "Observation", y = "Predicted Incidence") + theme_minimal() ## plot actual vs predicted oldpar <- par(mfrow = c(1, 3)) # 3 plots side-by-side # Random Forest plot(mapdata$incidence, mapdata$rf_pred, xlab = "Observed", ylab = "Predicted", main = "Random Forest", pch = 19, col = "steelblue") abline(0, 1, col = "red", lwd = 2) # XGBoost plot(mapdata$incidence, mapdata$xgb_pred, xlab = "Observed", ylab = "Predicted", main = "XGBoost", pch = 19, col = "darkgreen") abline(0, 1, col = "red", lwd = 2) # SVR plot(mapdata$incidence, mapdata$svr_pred, xlab = "Observed", ylab = "Predicted", main = "SVR", pch = 19, col = "purple") abline(0, 1, col = "red", lwd = 2) par(oldpar) ## Actual vs predicted plot with correlations library(ggplot2) library(ggpubr) # For stat_cor mapdata$rf_pred <- predict(rf_model) mapdata$xgb_pred <- predict(xgb_model, newdata = x_vars) mapdata$svr_pred <- predict(svr_model, newdata = mapdata) # Random Forest Plot p1 <- ggplot(mapdata, aes(x = incidence, y = rf_pred)) + geom_point(color = "steelblue", alpha = 0.6) + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) + labs(title = "Random Forest", x = "Observed Incidence", y = "Predicted Incidence") + theme_minimal() # XGBoost Plot p2 <- ggplot(mapdata, aes(x = incidence, y = xgb_pred)) + geom_point(color = "darkgreen", alpha = 0.6) + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) + labs(title = "XGBoost", x = "Observed Incidence", y = "Predicted Incidence") + theme_minimal() # SVR Plot p3 <- ggplot(mapdata, aes(x = incidence, y = svr_pred)) + geom_point(color = "purple", alpha = 0.6) + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) + labs(title = "SVR", x = "Observed Incidence", y = "Predicted Incidence") + theme_minimal() combined_plot <- ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = FALSE) ``` ## Cross-Validation of Predictive Models Cross-validation is a critical step for evaluating model performance and ensuring robust, generalizable predictions. The `mlspatial` workflow supports multiple algorithms, including Random Forest (RF), XGBoost (XGB), and Support Vector Regression (SVR). # Step 1: Setup We first define a reproducible environment and cross-validation strategy. # Step 2: Random Forest Cross-Validation Random Forest models benefit from tuning key parameters while evaluating predictive performance using cross-validation. # Step 3: XGBoost Cross-Validation For XGBoost, we convert data to numeric and use xgb.cv to find the optimal number of boosting rounds while monitoring RMSE. # Step 4: Support Vector Regression (SVR) Cross-Validation SVR often benefits from feature scaling. We apply center/scale preprocessing during cross-validation. # Interpretation - Random Forest provides variable importance metrics and robust performance across folds. - XGBoost uses gradient boosting to optimize predictive accuracy and can be fine-tuned via early stopping. - SVR is sensitive to feature scaling but can capture non-linear relationships effectively. These cross-validation procedures quantify expected out-of-sample performance, allowing comparison between models and selection of the best-performing algorithm for spatial epidemiology predictions. ```{r} ## CROSS-VALIDATION #Step 1: Prepare common setup # Set seed for reproducibility set.seed(123) library(caret) # Define 5-fold cross-validation cv_control <- trainControl(method = "cv", number = 3) # 1. Random Forest library(randomForest) rf_cv <- train( incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, method = "rf", trControl = cv_control, tuneLength = 3, importance = TRUE ) print(rf_cv) # 2. XGBoost library(xgboost) mapdata <- st_drop_geometry(mapdata) mapdata$incidence <- as.numeric(mapdata$incidence) cv_control <- trainControl( method = "cv", number = 5 ) # XGBoost with caret xgb_cv <- xgb.cv( params = params, data = dtrain, nrounds = 100, nfold = 5, # 5-fold CV early_stopping_rounds = 10, # stop if no improvement in 10 rounds metrics = list("rmse"), verbose = 0 ) print(xgb_cv) best_nrounds <- xgb_cv$best_iteration cat("Best number of rounds:", best_nrounds, "\n") # Extract mean RMSE mean_rmse <- min(xgb_cv$evaluation_log$test_rmse_mean) cat("XGBoost CV RMSE:", mean_rmse, "\n") # 3. Support Vector Regression (SVR) library(e1071) library(kernlab) svr_cv <- train( incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, method = "svmRadial", trControl = cv_control, preProcess = c("center", "scale"), # SVR often benefits from scaling tuneLength = 3 ) print(svr_cv) ``` ## Spatial Maps of Model Predictions The `mlspatial` package allows visualization of predicted incidence values across geographic regions. Here, we map predictions from Random Forest (RF), XGBoost (XGB), and Support Vector Regression (SVR) models using African country shapefiles and incidence data. # Step 1: Join Shapefiles and Incidence Data # Step 2: Add Predicted Values Predictions are added to the mapdata object for each model. # Step 3: Visualize Individual Model Maps Random Forest, XGBoost, and SVR # Step 4: Compare Models Side by Side To visually compare predicted spatial patterns across models: This approach allows users to compare spatial prediction patterns across models directly, facilitating interpretation and selection of the most appropriate predictive approach for regional incidence mapping. ```{r} ## Spatial maps of predicted values of each model # 1. Random Forest Spatial Map mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME") mapdata$pred_rf <- predict(rf_model, newdata = mapdata) tm_shape(mapdata) + tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile"), fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) # 2. XGBoost Spatial Map # Ensure matrix used in training mapdata$pred_xgb <- predict(xgb_model, newdata = x_vars) tm_shape(mapdata) + tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"), fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) # 3. Support Vector Regression (SVR) Spatial Map mapdata$pred_svr <- predict(svr_model, newdata = mapdata) tm_shape(mapdata) + tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) # Compare Side by Side tmap_arrange( tm_shape(mapdata) + tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile"), fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE), tm_shape(mapdata) + tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"), fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE), tm_shape(mapdata) + tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) + tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE), nrow = 1) ``` ## Residual Analysis After predicting pancreatic cancer incidence with Random Forest (RF), XGBoost (XGB), and Support Vector Regression (SVR) models, it is important to examine residuals to assess model fit and identify systematic biases. # Step 1: Compute Residuals Residuals are calculated as the difference between observed (mapdata$incidence) and predicted values for each model. # Step 2: Compare Residual Distributions A boxplot allows comparison of prediction errors across models. # Step 3: Map Residuals Spatially Adding residuals to mapdata enables visualization of geographic patterns in model errors. This approach allows users to: - Quantify model errors across the dataset (via boxplots). - Identify spatial patterns in residuals, which can reveal geographic regions where models over- or under-predict incidence. ```{r} # Predicted Residuals # we've already trained all 3 models and have `mapdata$incidence` as your actual values. ### Step 1: Generate predictions and residuals for each model # Random Forest rf_preds <- predict(rf_model, newdata = mapdata) rf_resid <- mapdata$incidence - rf_preds # XGBoost xgb_preds <- predict(xgb_model, newdata = x_vars) # x_vars = model.matrix(...) xgb_resid <- mapdata$incidence - xgb_preds # SVR svr_preds <- predict(svr_model, newdata = mapdata) svr_resid <- mapdata$incidence - svr_preds ### Step 2: Combine into a single data frame residuals_df <- data.frame( actual = mapdata$incidence, rf_pred = rf_preds, rf_resid = rf_resid, xgb_pred = xgb_preds, xgb_resid = xgb_resid, svr_pred = svr_preds, svr_resid = svr_resid ) # export library(writexl) ### Compare residual distributions boxplot(residuals_df$rf_resid, residuals_df$xgb_resid, residuals_df$svr_resid, names = c("RF", "XGB", "SVR"), main = "Model Residuals", ylab = "Prediction Error (Residual)") ``` ## Spatial Visualization of Model Residuals After computing prediction residuals for Random Forest (RF), XGBoost (XGB), and Support Vector Regression (SVR), we can visualize spatial patterns in model errors. This helps identify regions where models over- or under-predict pancreatic cancer incidence. # Step 1: Add residuals to the spatial data # Step 2: Set static plotting mode # Step 3: Create individual residual maps # Step 4: Combine residual maps in a grid This layout provides a side-by-side comparison of residuals from the three models, highlighting geographic regions where each model may over- or under-predict incidence. It complements boxplots and scatterplots for a comprehensive residual analysis. ```{r} ## Spatial maps of residual values from each model #Add residuals to mapdata #You should already have these from the previous steps: mapdata$rf_resid <- residuals_df$rf_resid mapdata$xgb_resid <- residuals_df$xgb_resid mapdata$svr_resid <- residuals_df$svr_resid # Set tmap mode to plot (static map) tmap_mode("plot") # Create individual residual maps map_rf <- tm_shape(mapdata) + tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile", midpoint = 0), fill.legend = tm_legend(title = "Inci_rf_resid")) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) map_xgb <- tm_shape(mapdata) + tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile", midpoint = 0), fill.legend = tm_legend(title = "Inci_xgb_resid")) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) map_svr <- tm_shape(mapdata) + tm_fill("svr_resid", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"), fill.legend = tm_legend(title = "Inci_svr_resid")) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE) #Step 3: Combine maps in a grid # Combine maps in a grid layout tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1) ``` ## Barplot and Spatial maps for RMSE and MAE After training all three models, we evaluate their performance using Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), and R². We also integrate these metrics into residual maps for spatial interpretation. # Step 1: Compute evaluation metrics # Step 2: Barplots of RMSE, MAE, and R² # Step 3: Annotate residual maps with performance metrics This section provides: - Barplots for overall RMSE, MAE, and R² across models. - Spatial maps showing residuals, annotated with performance metrics, to visually identify regions with higher prediction errors. ```{r} ##Barplot and Spatial maps for RMSE and MAE #Step 1: Calculate RMSE and MAE for each model # Random Forest rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence) # XGBoost xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence) # SVR svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence) #Step 2: Combine into a summary data frame model_eval <- data.frame( Model = c("Random Forest", "XGBoost", "SVR"), RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]), MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]), Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"]) ) print(model_eval) #Visualize MAE and RMSE oldpar <- par(mfrow = c(1, 3)) #Barplot of RMSE barplot(model_eval$RMSE, names.arg = model_eval$Model, col = "skyblue", las = 1, main = "Model RMSE", ylab = "RMSE") #Barplot of MAE barplot(model_eval$MAE, names.arg = model_eval$Model, col = "lightgreen", las = 1, main = "Model MAE", ylab = "MAE") #Barplot of MAE barplot(model_eval$Rsquared, names.arg = model_eval$Model, col = "grey", las = 1, main = "Model Rsquared", ylab = "MAE") par(oldpar) #Add metrics to residual maps as captions map_rf <- tm_shape(mapdata) + tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", midpoint = 0), title = paste0("rf_resid\nRMSE: ", round(rf_metrics["RMSE"], 2), "\nMAE: ", round(rf_metrics["MAE"], 2))) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) map_xgb <- tm_shape(mapdata) + tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "-RdBu", midpoint = 0), title = paste0("xgb_resid\nRMSE: ", round(xgb_metrics["RMSE"], 2), "\nMAE: ", round(xgb_metrics["MAE"], 2))) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) map_svr <- tm_shape(mapdata) + tm_fill("svr_resid", fill.scale =tm_scale_intervals(values = "-RdBu", midpoint = 0), title = paste0("svr_resid\nRMSE: ", round(svr_metrics["RMSE"], 2), "\nMAE: ", round(svr_metrics["MAE"], 2))) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1) ``` ## Spatial Autocorrelation Analysis Understanding spatial dependence is essential in epidemiology. The `mlspatial` package incorporates spatial statistical tools to quantify clustering and detect geographic patterns. # Global Spatial Autocorrelation: Moran’s I is used to assess overall spatial dependence. #Local Spatial Clustering: LISA identifies local clusters (e.g., high-high, low-low regions). ```{r} #Global Moran’s I, Local Moran’s I (LISA), Cluster categories (e.g., High-High, Low-Low), Maps: Moran’s I map, #LISA clusters, High-High clusters using the predicted values from the four machine learning models #Assumptions: Your spatial data is in mapdata (an sf object). #Predicted values for each model are already stored in: rf_preds, xgb_preds, svr_preds, mlp_preds #STEP 1: Create Spatial Weights (if not done yet) neighbors <- poly2nb(mapdata) #if this gives warning, use the below codes mapdata <- st_as_sf(mapdata) # If it's not already sf mapdata <- st_make_valid(mapdata) # Fix any invalid geometries neighbors <- poly2nb(mapdata, snap = 1e-15) ## Try 1e-6, 1e-5, or higher if needed. You can adjust snap upward incrementally until the warnings disappear or are reduced listw <- nb2listw(neighbors, style = "W", zero.policy = TRUE) #STEP 2: Define a function to compute Moran’s I and cluster categories analyze_spatial_autocorrelation <- function(mapdata, values, listw, model_name, signif_level = 0.05) { # Standardize predicted values mapdata$val_st <- scale(values)[, 1] # Compute lag mapdata$lag_val <- lag.listw(listw, mapdata$val_st, zero.policy = TRUE) # Global Moran's I global_moran <- moran.test(values, listw, zero.policy = TRUE) # Local Moran's I (LISA) lisa <- localmoran(values, listw, zero.policy = TRUE) lisa_df <- as.data.frame(lisa) #rename p-value column colnames(lisa_df)[5] <- "Pr_z" # Add to mapdata mapdata$Ii <- lisa_df$Ii mapdata$Z_Ii <- lisa_df$Z.I mapdata$Pr_z <- lisa_df$Pr_z #mapdata$Pr_z <- lisa_df[, "Pr(z > 0)"] # Classify clusters mapdata <- mapdata %>% mutate( cluster = case_when( val_st > 0 & lag_val > 0 & Pr_z <= signif_level ~ "High-High", val_st < 0 & lag_val < 0 & Pr_z <= signif_level ~ "Low-Low", val_st < 0 & lag_val > 0 & Pr_z <= signif_level ~ "Low-High", val_st > 0 & lag_val < 0 & Pr_z <= signif_level ~ "High-Low", TRUE ~ "Not Significant" ) ) return(list( map = mapdata, global_moran = global_moran )) } #STEP 3: Run the function for each model rf_result <- analyze_spatial_autocorrelation(mapdata, rf_preds, listw, "Random Forest") xgb_result <- analyze_spatial_autocorrelation(mapdata, xgb_preds, listw, "XGBoost") svr_result <- analyze_spatial_autocorrelation(mapdata, svr_preds, listw, "SVR") #STEP 4: Mapping LISA Clusters and High-High Areas for Random Forest tmap_mode("plot") # LISA Cluster Map. fill.scale =tm_scale_intervals(values = "-RdBu") tm_rf <- tm_shape(rf_result$map) + tm_fill( "cluster", fill.scale = tm_scale(values = c( "High-High" = "red", "Low-Low" = "blue", "Low-High" = "lightblue", "High-Low" = "pink", "Not Significant" = "gray90")), fill.legend = tm_legend(title = "LISA Clusters - RF")) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) # Mapping LISA Clusters and High-High Areas for XGBoost tmap_mode("plot") # LISA Cluster Map tm_xgb <- tm_shape(xgb_result$map) + tm_fill("cluster", fill.scale = tm_scale(values = c( "High-High" = "red", "Low-Low" = "blue", "Low-High" = "lightblue", "High-Low" = "pink", "Not Significant" = "gray90")), fill.legend = tm_legend(title = "LISA Clusters - XGBoost")) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) #Mapping LISA Clusters and High-High Areas for SVR tmap_mode("plot") # LISA Cluster Map tm_svr <- tm_shape(svr_result$map) + tm_fill("cluster", fill.scale = tm_scale(values = c( "High-High" = "red", "Low-Low" = "blue", "Low-High" = "lightblue", "High-Low" = "pink", "Not Significant" = "gray90")), fill.legend = tm_legend(title = "LISA Clusters - SVR")) + tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) tmap_arrange(tm_rf, tm_xgb, tm_svr, nrow = 1) #You can also arrange maps side-by-side using tmap_arrange(). #View Global Moran’s I Results #These print the test statistic and p-values for global spatial autocorrelation of predictions. rf_result$global_moran xgb_result$global_moran svr_result$global_moran ```