1 Generalized Additive Model

There are two common implementations of GAMs in R. The older version (originally written for S-PLUS) is available in the ‘gam’ package by Hastie and Tibshirani. The newer version, which we use below, is the ‘mgcv’ package by Simon Wood. The basic modeling procedure is similar in both packages (each provides a function named gam(), so be wary of having both packages loaded at the same time), but the underlying computational approaches differ, as do the optimization arguments and the model output. Expect slightly different results even when the two packages are given the same model structure on the same dataset, as sketched below.
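
A minimal sketch of the two interfaces on the Boston data (the ‘gam’ lines are commented out and assume that package is installed; the object names fit_hs and fit_mgcv are only illustrative, and only the mgcv version is used in the rest of this page):

# Hastie & Tibshirani's implementation: here s() is a smoothing spline with a df argument
# library(gam)
# fit_hs <- gam::gam(medv ~ s(lstat) + s(rm), data = MASS::Boston)

# Simon Wood's implementation: here s() is a penalized regression spline
library(mgcv)
fit_mgcv <- mgcv::gam(medv ~ s(lstat) + s(rm), data = MASS::Boston)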

1.1 GAM on Boston Housing dataset

library(MASS)
set.seed(1234)
sample_index <- sample(nrow(Boston),nrow(Boston)*0.70)
Boston_train <- Boston[sample_index,]
Boston_test <- Boston[-sample_index,]
str(Boston_train)

Model and plots

library(mgcv)

#create gam model
Boston_gam <- gam(medv ~ s(crim)+s(zn)+s(indus)+chas+s(nox)
                 +s(rm)+s(age)+s(dis)+rad+s(tax)+s(ptratio)
                 +s(black)+s(lstat),data=Boston_train)

summary(Boston_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) + 
##     s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.9672     1.4638  12.958   <2e-16 ***
## chas          1.0098     0.7104   1.421   0.1562    
## rad           0.3454     0.1505   2.296   0.0224 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df      F  p-value    
## s(crim)    4.543  5.488  7.436 6.38e-07 ***
## s(zn)      1.000  1.000  0.973 0.324644    
## s(indus)   6.701  7.629  4.124 0.000171 ***
## s(nox)     8.946  8.995 17.931  < 2e-16 ***
## s(rm)      7.792  8.605 19.944  < 2e-16 ***
## s(age)     2.415  3.058  1.302 0.282947    
## s(dis)     8.742  8.975  8.895 4.36e-12 ***
## s(tax)     6.122  7.145  7.091 7.01e-08 ***
## s(ptratio) 1.000  1.000 21.257 5.90e-06 ***
## s(black)   1.000  1.000  0.002 0.963662    
## s(lstat)   6.362  7.484 18.551  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.893   Deviance explained =   91%
## GCV = 10.713  Scale est. = 8.9693    n = 354
plot(Boston_gam, pages=1)

Model AIC/BIC and mean residual deviance

AIC(Boston_gam)
BIC(Boston_gam)
Boston_gam$deviance

In-sample fit performance

#in-sample mean residual deviance (deviance divided by residual df)
Boston_gam.mrd.train <- Boston_gam$deviance/Boston_gam$df.residual 
#in-sample average sum of squared errors (deviance divided by the training sample size)
Boston_gam.mse.train <- Boston_gam$deviance/nrow(Boston_train) 

#using the predict() function
pi <- predict(Boston_gam,Boston_train)
mean((pi - Boston_train$medv)^2)
## [1] 7.509325

Out-of-sample fit performance

pi.out <- predict(Boston_gam,Boston_test)
mean((pi.out - Boston_test$medv)^2)
## [1] 14.04571


1.2 GAM on Bankruptcy dataset

Bank_data <- read.csv(file = "https://xiaoruizhu.github.io/Data-Mining-R/lecture/data/bankruptcy.csv", header=T)
# summary(Bank_data)

sample_index <- sample(nrow(Bank_data),nrow(Bank_data)*0.70)
Bank_train <- Bank_data[sample_index,]
Bank_test <- Bank_data[-sample_index,]

Model and plots

Note that DLRSN is a binary (0/1) response, but the model below is fit with gam()'s default gaussian family (identity link), so the fitted values from predict(..., type="response") are not constrained to [0,1]; they are simply treated as probability-like scores when thresholded later.

Bank_gam <- gam(DLRSN ~ s(R1)+s(R2)+s(R3)+s(R4)+
                  s(R5)+s(R6)+s(R7)+s(R8)+s(R9)+s(R10), data=Bank_train)

summary(Bank_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## DLRSN ~ s(R1) + s(R2) + s(R3) + s(R4) + s(R5) + s(R6) + s(R7) + 
##     s(R8) + s(R9) + s(R10)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.139028   0.004601   30.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df      F  p-value    
## s(R1)  1.000  1.000  0.553   0.4571    
## s(R2)  4.227  5.204 25.941  < 2e-16 ***
## s(R3)  2.125  2.723  8.647 5.27e-05 ***
## s(R4)  1.000  1.000  5.893   0.0153 *  
## s(R5)  6.630  7.743  2.379   0.0140 *  
## s(R6)  4.501  5.568 22.453  < 2e-16 ***
## s(R7)  1.000  1.000  0.788   0.3749    
## s(R8)  6.564  7.670  9.507 1.68e-12 ***
## s(R9)  4.046  5.069 14.335 5.31e-14 ***
## s(R10) 5.621  6.770 50.329  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.327   Deviance explained = 33.4%
## GCV = 0.08137  Scale est. = 0.080564  n = 3805
plot(Bank_gam, pages=1)
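
Since DLRSN is binary, an alternative is a logistic GAM, which returns fitted probabilities in (0,1). A minimal sketch (the object name Bank_gam_logit is only illustrative; all results reported in this section come from the gaussian fit above):

Bank_gam_logit <- gam(DLRSN ~ s(R1)+s(R2)+s(R3)+s(R4)+
                        s(R5)+s(R6)+s(R7)+s(R8)+s(R9)+s(R10),
                      family=binomial, data=Bank_train)
# predict(Bank_gam_logit, Bank_train, type="response") then returns probabilities in (0,1)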

In-sample fit performance

The in-sample confusion matrix:

pcut_gam <- 1/36   # = weight0/(weight0+weight1) for the 35:1 cost ratio defined below
prob_gam_in <-predict(Bank_gam, Bank_train, type="response")
pred_gam_in <- (prob_gam_in>=pcut_gam)*1
table(Bank_train$DLRSN, pred_gam_in, dnn=c("Observed","Predicted"))
##         Predicted
## Observed    0    1
##        0 1335 1941
##        1   17  512

You can also check the in-sample asymmetric cost:

bankcost <- function(r, pi){
  weight1 = 35
  weight0 = 1
  pcut <- weight0/(weight0+weight1)
  c1 = (r==1)&(pi<pcut) #logical vector - true if actual 1 but predict 0
  c0 = (r==0)&(pi>pcut) #logical vector - true if actual 0 but predict 1
  return(mean(weight1*c1+weight0*c0))
}
bankcost(Bank_train$DLRSN, pred_gam_in)
## [1] 0.6664915

ROC Curve:

library(ROCR)
pred <- prediction(c(prob_gam_in), Bank_train$DLRSN)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8931203

Model AIC/BIC and mean residual deviance

AIC(Bank_gam)
## [1] 1253.97
BIC(Bank_gam)
## [1] 1495.704
#in-sample mean residual deviance using df 
Bank_gam$dev/Bank_gam$df.residual 
## [1] 0.08056376

Out-of-sample fit performance

The out-of-sample confusion matrix:

prob_gam_out <- predict(Bank_gam, Bank_test, type="response")
pred_gam_out <- (prob_gam_out>=pcut_gam)*1
table(Bank_test$DLRSN, pred_gam_out, dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0   1
##        0 586 798
##        1  12 235

The asymmetric cost is:

bankcost(Bank_test$DLRSN, pred_gam_out)
## [1] 0.7467811

ROC Curve:

pred <- prediction(c(prob_gam_out), Bank_test$DLRSN)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.8753335


1.3 GAM on Credit Card Default Data

The Credit Card Default Data has 10800 observations and 23 predictive variables. The details of the data can be found at http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients. Think about what kinds of factors might cause people to fail to pay their credit card balance.

credit_data <- read.csv(file = "https://xiaoruizhu.github.io/Data-Mining-R/lecture/data/credit_default.csv", header=T)

# rename
library(dplyr)
credit_data<- rename(credit_data, default=default.payment.next.month)
# convert categorical data to factor
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)

Now split the data 90/10 as training/testing datasets:

index <- sample(nrow(credit_data),nrow(credit_data)*0.90)
credit_train = credit_data[index,]
credit_test = credit_data[-index,]

Some of these predictors are categorical variables, and they enter the gam() model as parametric (partially linear) terms. The flexible s() smoother is only applied to continuous predictors such as LIMIT_BAL and AGE. To keep the summary output compact, we demonstrate with five continuous variables as smooth terms and the three categorical variables SEX, EDUCATION, and MARRIAGE as parametric terms.

## Create a formula for a model with a large number of variables:
gam_formula <- as.formula("default~s(LIMIT_BAL)+s(AGE)+s(PAY_0)+s(BILL_AMT1)+s(PAY_AMT1)+SEX+EDUCATION+MARRIAGE")

credit_gam <- gam(formula = gam_formula, family=binomial, data=credit_train)
summary(credit_gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## default ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) + 
##     SEX + EDUCATION + MARRIAGE
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.217750   0.067325 -18.088  < 2e-16 ***
## SEX2        -0.175662   0.053552  -3.280  0.00104 ** 
## EDUCATION2  -0.046877   0.061423  -0.763  0.44535    
## EDUCATION3  -0.218309   0.083468  -2.615  0.00891 ** 
## EDUCATION4  -1.204359   0.306518  -3.929 8.52e-05 ***
## MARRIAGE2   -0.117430   0.060287  -1.948  0.05143 .  
## MARRIAGE3   -0.007125   0.234567  -0.030  0.97577    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df   Chi.sq  p-value    
## s(LIMIT_BAL) 5.382  6.283  133.792  < 2e-16 ***
## s(AGE)       1.330  1.595    3.418 0.094709 .  
## s(PAY_0)     6.714  7.454 1207.921  < 2e-16 ***
## s(BILL_AMT1) 3.827  4.780   47.780  4.1e-09 ***
## s(PAY_AMT1)  5.199  6.402   26.454 0.000282 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.198   Deviance explained = 16.8%
## UBRE = -0.11427  Scale est. = 1         n = 10800
plot(credit_gam, shade=TRUE, seWithMean=TRUE, scale=0, pages = 1)

The function vis.gam() can visualize the nonlinear relationship between two predictors and the linear predictor as a 3D surface, as follows:

# vis.gam(credit_gam)
vis.gam(credit_gam, view=c("LIMIT_BAL","AGE"), theta= 140) # different view 

1.3.1 In-sample fit performance

To examine the in-sample fit performance, you can look at the confusion matrix using the commands below. We use a cut-off probability of 1/6, which corresponds to the 5:1 asymmetric cost ratio used later (pcut = weight0/(weight0+weight1) = 1/6).

pcut_gam <- 1/6
prob_gam_in <-predict(credit_gam,credit_train,type="response")
pred_gam_in <- (prob_gam_in>=pcut_gam)*1
table(credit_train$default, pred_gam_in,dnn=c("Observed","Predicted"))
##         Predicted
## Observed    0    1
##        0 5564 2841
##        1  694 1701

The asymmetric cost with a 5:1 cost ratio is:

creditcost <- function(r, pi){
  weight1 = 5
  weight0 = 1
  pcut <- weight0/(weight0+weight1)
  c1 = (r==1)&(pi<pcut) #logical vector - true if actual 1 but predict 0
  c0 = (r==0)&(pi>pcut) #logical vector - true if actual 0 but predict 1
  return(mean(weight1*c1+weight0*c0))
}
creditcost(credit_train$default, pred_gam_in)
## [1] 0.5843519

Training model AIC, BIC, and mean residual deviance:

AIC(credit_gam)
## [1] 9565.864
BIC(credit_gam)
## [1] 9780.494
# credit_gam$deviance
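
The mean residual deviance mentioned in the heading can be obtained, as in the bankruptcy example, by dividing the model deviance by the residual degrees of freedom (output not shown here):

credit_gam$deviance/credit_gam$df.residual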

ROC Curve:

library(ROCR)
pred <- prediction(predictions = c(prob_gam_in), labels = credit_train$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7614636

1.3.2 Out-of-sample fit performance

pcut <- 1/6
prob_gam_out <- predict(credit_gam, credit_test,type="response")
pred_gam_out <- (prob_gam_out>=pcut)*1
table(credit_test$default, pred_gam_out,dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0   1
##        0 633 330
##        1  56 181

The asymmetric cost is:

creditcost(credit_test$default, pred_gam_out)
## [1] 0.5083333

ROC Curve:

pred <- prediction(predictions = c(prob_gam_out), labels = credit_test$default)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.7919257


1.4 GAM using the “Motorcycle” dataset

Finally, we can also apply gam() to perform univariate smoothing on the motorcycle data.

library(MASS)
data('mcycle')
str(mcycle)
summary(mcycle)
# Extract the response and predictor with shorter names for ease of use
Y <- mcycle$accel
X <- mcycle$times

#Scatterplot
plot(Y~X, xlab="time",ylab="Acceleration", main="Scatterplot of Acceleration against Time")

library(mgcv)
s_gam <- gam(Y ~ s(X),data=mcycle)
summary(s_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Y ~ s(X)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -25.546      1.951   -13.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df     F p-value    
## s(X) 8.693  8.972 53.52  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.783   Deviance explained = 79.8%
## GCV = 545.78  Scale est. = 506       n = 133
#plot the model
plot(s_gam, residuals = TRUE, pch = 1)
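
As an optional check (a minimal sketch, not part of the original output; X_grid and Y_hat are illustrative names), the fitted smooth can be overlaid on the raw scatterplot by predicting on a fine grid of time values:

X_grid <- data.frame(X = seq(min(X), max(X), length.out = 200))
Y_hat  <- predict(s_gam, newdata = X_grid)
plot(Y ~ X, xlab = "time", ylab = "Acceleration",
     main = "Fitted GAM smooth over the raw data")
lines(X_grid$X, Y_hat, col = "blue", lwd = 2)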
