In-Class Exercise 9: Geographically Weighted Random Forest

Published

March 13, 2023

Modified

March 27, 2023

Imports

Packages

  • gwmodel for calibrate gw model

  • spatialml for calibrate random forest

  • ggpubr for stitch mutiple graph together

  • oslrr for calculating and viewing model diagnostics

  • devtools for importing packages not in CRAN (not needed)

  • tidymodels for creating ML workflows (not needed)

pacman::p_load(sf, GWmodel, SpatialML, tidyverse, tmap, ggpubr, oslrr, devtools, rsample)

Data

mdata <- read_rds("data/aspatial/mdata.rds")
set.seed(1234)
resale_split <- initial_split(mdata,
                              prop = 6.5/10)
train_data <- training(resale_split)
test_data <- testing(resale_split)
write_rds(train_data, "data/model/train_data.rds")
write_rds(test_data, "data/model/test_data.rds")
train_data <- read_rds("data/model/train_data.rds")
test_data <- read_rds("data/model/test_data.rds")

Building a non-spatial multiple linear regression (with OLS method)

price_mlr <- lm(resale_price ~ floor_area_sqm +

storey_order + remaining_lease_mths +

PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +

PROX_MRT + PROX_PARK + PROX_MALL +

PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +

WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +

WITHIN_1KM_PRISCH,

data=train_data)

summary(price_mlr)

price_mlr <- lm(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL +
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                  WITHIN_1KM_PRISCH,
                data=train_data)
summary(price_mlr)

Call:
lm(formula = resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths + 
    PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + 
    PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + 
    WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH, 
    data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-205193  -39120   -1930   36545  472355 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)              107601.073  10601.261  10.150  < 2e-16 ***
floor_area_sqm             2780.698     90.579  30.699  < 2e-16 ***
storey_order              14299.298    339.115  42.167  < 2e-16 ***
remaining_lease_mths        344.490      4.592  75.027  < 2e-16 ***
PROX_CBD                 -16930.196    201.254 -84.124  < 2e-16 ***
PROX_ELDERLYCARE         -14441.025    994.867 -14.516  < 2e-16 ***
PROX_HAWKER              -19265.648   1273.597 -15.127  < 2e-16 ***
PROX_MRT                 -32564.272   1744.232 -18.670  < 2e-16 ***
PROX_PARK                 -5712.625   1483.885  -3.850 0.000119 ***
PROX_MALL                -14717.388   2007.818  -7.330 2.47e-13 ***
PROX_SUPERMARKET         -26881.938   4189.624  -6.416 1.46e-10 ***
WITHIN_350M_KINDERGARTEN   8520.472    632.812  13.464  < 2e-16 ***
WITHIN_350M_CHILDCARE     -4510.650    354.015 -12.741  < 2e-16 ***
WITHIN_350M_BUS             813.493    222.574   3.655 0.000259 ***
WITHIN_1KM_PRISCH         -8010.834    491.512 -16.298  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 61650 on 10320 degrees of freedom
Multiple R-squared:  0.7373,    Adjusted R-squared:  0.737 
F-statistic:  2069 on 14 and 10320 DF,  p-value: < 2.2e-16

Can use this report for looking at generic trend, but don’t report these values (?)

GWR Methods

GWR Predictive model

First, need to change to spatial

train_date_sp <- as_Spatial(train_data)
test_date_sp <- as_Spatial(test_data)

Computing adaptive bandwidths

Refer to week 9 HOE

GWR Random Forest (using SpatialML)

Preparing coordinates data

# Extracting and save geometric data in coordinate files
coords <- st_coordinates(mdata)
coords_train <- st_coordinates(train_data)
coords_test <- st_coordinates(test_data)
write_rds(coords_train, "data/model/coords_train.rds")
write_rds(coords_test, "data/model/coords_test.rds")
coords_train <- read_rds("data/model/coords_train.rds")
coords_test <- read_rds("data/model/coords_test.rds")

Drop Geometry Field

train_data <- train_data %>%
  st_drop_geometry()

Calibrate Random Forest

rf <- ranger(resale_price ~ floor_area_sqm +
                  storey_order + remaining_lease_mths +
                  PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
                  PROX_MRT + PROX_PARK + PROX_MALL +
                  PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
                  WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
                  WITHIN_1KM_PRISCH,
                data=train_data)
print(rf)
Ranger result

Call:
 ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths +      PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK +      PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +      WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH,      data = train_data) 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      10335 
Number of independent variables:  14 
Mtry:                             3 
Target node size:                 5 
Variable importance mode:         none 
Splitrule:                        variance 
OOB prediction error (MSE):       735115316 
R squared (OOB):                  0.9491221 

node size -> min is 5 per node, can increase if you want

OOB -> mean squared error, is different from the residual standard error in the OSL (is the square root of the standard error), they are not comparable. need to square root MSE if want to compare

Compared based on r squared is easier

GWR RF (Adaptive)

# gwRF_adaptive <- grf(resale_price ~ floor_area_sqm +
#                   storey_order + remaining_lease_mths +
#                   PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
#                   PROX_MRT + PROX_PARK + PROX_MALL +
#                   PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
#                   WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
#                   WITHIN_1KM_PRISCH,
#                 dframe=train_data,
#                 bw=55,
#                 kernel="adaptive",
#                 coords=coords_train)
# Save model
# write_rds(gwRF_adaptive, "data/model/gwRF_adaptive.rds")
gwRF_adaptive <- read_rds("data/model/gwRF_adaptive.rds")
# View the "importance" of each var (the weights)
vi_df <- as.data.frame(gwRF_adaptive$Global.Model$variable.importance)

Predicting using test data

Prep test data

test_data <- cbind(test_data, coords_test) %>% st_drop_geometry()

Predict on GWR RF adaptive

# gwRF_pred <- predict.grf(gwRF_adaptive,
#                          test_data,
#                          x.var.name="X",
#                          y.var.name="Y",
#                          local.w=1,
#                          global.w=0)

Ouput is a vector