::p_load(sf, GWmodel, SpatialML, tidyverse, tmap, ggpubr, oslrr, devtools, rsample) pacman
In-Class Exercise 9: Geographically Weighted Random Forest
Imports
Packages
gwmodel for calibrate gw model
spatialml for calibrate random forest
ggpubr for stitch mutiple graph together
oslrr for calculating and viewing model diagnostics
devtools for importing packages not in CRAN (not needed)
tidymodels for creating ML workflows (not needed)
Data
<- read_rds("data/aspatial/mdata.rds") mdata
set.seed(1234)
<- initial_split(mdata,
resale_split prop = 6.5/10)
<- training(resale_split)
train_data <- testing(resale_split) test_data
write_rds(train_data, "data/model/train_data.rds")
write_rds(test_data, "data/model/test_data.rds")
<- read_rds("data/model/train_data.rds")
train_data <- read_rds("data/model/test_data.rds") test_data
Building a non-spatial multiple linear regression (with OLS method)
price_mlr <- lm(resale_price ~ floor_area_sqm +
storey_order + remaining_lease_mths +
PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
PROX_MRT + PROX_PARK + PROX_MALL +
PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
WITHIN_1KM_PRISCH,
data=train_data)
summary(price_mlr)
<- lm(resale_price ~ floor_area_sqm +
price_mlr + remaining_lease_mths +
storey_order + PROX_ELDERLYCARE + PROX_HAWKER +
PROX_CBD + PROX_PARK + PROX_MALL +
PROX_MRT + WITHIN_350M_KINDERGARTEN +
PROX_SUPERMARKET + WITHIN_350M_BUS +
WITHIN_350M_CHILDCARE
WITHIN_1KM_PRISCH,data=train_data)
summary(price_mlr)
Call:
lm(formula = resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths +
PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK +
PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH,
data = train_data)
Residuals:
Min 1Q Median 3Q Max
-205193 -39120 -1930 36545 472355
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 107601.073 10601.261 10.150 < 2e-16 ***
floor_area_sqm 2780.698 90.579 30.699 < 2e-16 ***
storey_order 14299.298 339.115 42.167 < 2e-16 ***
remaining_lease_mths 344.490 4.592 75.027 < 2e-16 ***
PROX_CBD -16930.196 201.254 -84.124 < 2e-16 ***
PROX_ELDERLYCARE -14441.025 994.867 -14.516 < 2e-16 ***
PROX_HAWKER -19265.648 1273.597 -15.127 < 2e-16 ***
PROX_MRT -32564.272 1744.232 -18.670 < 2e-16 ***
PROX_PARK -5712.625 1483.885 -3.850 0.000119 ***
PROX_MALL -14717.388 2007.818 -7.330 2.47e-13 ***
PROX_SUPERMARKET -26881.938 4189.624 -6.416 1.46e-10 ***
WITHIN_350M_KINDERGARTEN 8520.472 632.812 13.464 < 2e-16 ***
WITHIN_350M_CHILDCARE -4510.650 354.015 -12.741 < 2e-16 ***
WITHIN_350M_BUS 813.493 222.574 3.655 0.000259 ***
WITHIN_1KM_PRISCH -8010.834 491.512 -16.298 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 61650 on 10320 degrees of freedom
Multiple R-squared: 0.7373, Adjusted R-squared: 0.737
F-statistic: 2069 on 14 and 10320 DF, p-value: < 2.2e-16
Can use this report for looking at generic trend, but don’t report these values (?)
GWR Methods
GWR Predictive model
First, need to change to spatial
<- as_Spatial(train_data)
train_date_sp <- as_Spatial(test_data) test_date_sp
Computing adaptive bandwidths
Refer to week 9 HOE
GWR Random Forest (using SpatialML)
Preparing coordinates data
# Extracting and save geometric data in coordinate files
<- st_coordinates(mdata)
coords <- st_coordinates(train_data)
coords_train <- st_coordinates(test_data) coords_test
write_rds(coords_train, "data/model/coords_train.rds")
write_rds(coords_test, "data/model/coords_test.rds")
<- read_rds("data/model/coords_train.rds")
coords_train <- read_rds("data/model/coords_test.rds") coords_test
Drop Geometry Field
<- train_data %>%
train_data st_drop_geometry()
Calibrate Random Forest
<- ranger(resale_price ~ floor_area_sqm +
rf + remaining_lease_mths +
storey_order + PROX_ELDERLYCARE + PROX_HAWKER +
PROX_CBD + PROX_PARK + PROX_MALL +
PROX_MRT + WITHIN_350M_KINDERGARTEN +
PROX_SUPERMARKET + WITHIN_350M_BUS +
WITHIN_350M_CHILDCARE
WITHIN_1KM_PRISCH,data=train_data)
print(rf)
Ranger result
Call:
ranger(resale_price ~ floor_area_sqm + storey_order + remaining_lease_mths + PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER + PROX_MRT + PROX_PARK + PROX_MALL + PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN + WITHIN_350M_CHILDCARE + WITHIN_350M_BUS + WITHIN_1KM_PRISCH, data = train_data)
Type: Regression
Number of trees: 500
Sample size: 10335
Number of independent variables: 14
Mtry: 3
Target node size: 5
Variable importance mode: none
Splitrule: variance
OOB prediction error (MSE): 735115316
R squared (OOB): 0.9491221
node size -> min is 5 per node, can increase if you want
OOB -> mean squared error, is different from the residual standard error in the OSL (is the square root of the standard error), they are not comparable. need to square root MSE if want to compare
Compared based on r squared is easier
GWR RF (Adaptive)
# gwRF_adaptive <- grf(resale_price ~ floor_area_sqm +
# storey_order + remaining_lease_mths +
# PROX_CBD + PROX_ELDERLYCARE + PROX_HAWKER +
# PROX_MRT + PROX_PARK + PROX_MALL +
# PROX_SUPERMARKET + WITHIN_350M_KINDERGARTEN +
# WITHIN_350M_CHILDCARE + WITHIN_350M_BUS +
# WITHIN_1KM_PRISCH,
# dframe=train_data,
# bw=55,
# kernel="adaptive",
# coords=coords_train)
# Save model
# write_rds(gwRF_adaptive, "data/model/gwRF_adaptive.rds")
<- read_rds("data/model/gwRF_adaptive.rds") gwRF_adaptive
# View the "importance" of each var (the weights)
<- as.data.frame(gwRF_adaptive$Global.Model$variable.importance) vi_df
Predicting using test data
Prep test data
<- cbind(test_data, coords_test) %>% st_drop_geometry() test_data
Predict on GWR RF adaptive
# gwRF_pred <- predict.grf(gwRF_adaptive,
# test_data,
# x.var.name="X",
# y.var.name="Y",
# local.w=1,
# global.w=0)
Ouput is a vector