1 A Case Study on the Price of Gasoline

1.1 Do you believe the ARIMA model can do the magic?

1.1.1 YES!

Give me your reasons.

1.1.2 NO!

Why?

1.2 Import the data

Gas_prices <- readxl::read_xlsx(path = "data/Gas_prices_1.xlsx")
colnames(Gas_prices)

[1] “Date” “Year” “Month” “Unleaded”
[5] “Crude_price” “SP500” “PDI” “CPI”
[9] “RR_sales” “Unemp” “Housing” “Demand”
[13] “Production” “Imports” “Stocks” “Time”
[17] “L1_Unleaded” “L1_Crude_price” “L1_SP500” “L1_PDI”
[21] “L1_CPI” “L1_RR_sales” “L1_Unemp” “L1_Housing”
[25] “L1_Demand” “L1_Production” “L1_Imports” “L1_Stocks”
[29] “Recession”

if (!require("forecast")){install.packages("forecast")}

1.3 Scatter plot between Unleaded and predictors

library("PerformanceAnalytics")
my_data1 <- Gas_prices[,-c(1:3,10:29)]
chart.Correlation(R = my_data1, histogram=TRUE, pch=19)

my_data2 <- Gas_prices[,-c(1:3,5:9, 16:29)]
chart.Correlation(R = my_data2, histogram=TRUE, pch=19)

my_data3 <- Gas_prices[,-c(1:3,5:16,23:29)]
chart.Correlation(R = my_data3, histogram=TRUE, pch=19)

my_data4 <- Gas_prices[,-c(1:3,5:22, 29)]
chart.Correlation(R = my_data4, histogram=TRUE, pch=19)

1.4 Statistical Test of Correlations

if (!require("Hmisc")){install.packages("Hmisc")}
library("Hmisc")
res1 <- rcorr(as.matrix(my_data1))
round(res1$r, digits = 2)
##             Unleaded Crude_price SP500  PDI  CPI RR_sales
## Unleaded        1.00        0.98  0.53 0.87 0.90     0.73
## Crude_price     0.98        1.00  0.47 0.83 0.86     0.68
## SP500           0.53        0.47  1.00 0.69 0.67     0.79
## PDI             0.87        0.83  0.69 1.00 1.00     0.86
## CPI             0.90        0.86  0.67 1.00 1.00     0.83
## RR_sales        0.73        0.68  0.79 0.86 0.83     1.00
round(res1$P, 2)
##             Unleaded Crude_price SP500 PDI CPI RR_sales
## Unleaded          NA           0     0   0   0        0
## Crude_price        0          NA     0   0   0        0
## SP500              0           0    NA   0   0        0
## PDI                0           0     0  NA   0        0
## CPI                0           0     0   0  NA        0
## RR_sales           0           0     0   0   0       NA
res2 <- rcorr(as.matrix(my_data2))
round(res2$r, 2)
##            Unleaded Unemp Housing Demand Production Imports Stocks
## Unleaded       1.00  0.56   -0.64  -0.53       0.15   -0.03   0.68
## Unemp          0.56  1.00   -0.78  -0.39      -0.02   -0.12   0.54
## Housing       -0.64 -0.78    1.00   0.56      -0.22    0.32  -0.61
## Demand        -0.53 -0.39    0.56   1.00      -0.82    0.72  -0.60
## Production     0.15 -0.02   -0.22  -0.82       1.00   -0.69   0.42
## Imports       -0.03 -0.12    0.32   0.72      -0.69    1.00  -0.17
## Stocks         0.68  0.54   -0.61  -0.60       0.42   -0.17   1.00
round(res2$P, 2)
##            Unleaded Unemp Housing Demand Production Imports Stocks
## Unleaded         NA  0.00       0      0       0.02    0.61   0.00
## Unemp          0.00    NA       0      0       0.76    0.07   0.00
## Housing        0.00  0.00      NA      0       0.00    0.00   0.00
## Demand         0.00  0.00       0     NA       0.00    0.00   0.00
## Production     0.02  0.76       0      0         NA    0.00   0.00
## Imports        0.61  0.07       0      0       0.00      NA   0.01
## Stocks         0.00  0.00       0      0       0.00    0.01     NA
res3 <- rcorr(as.matrix(my_data3))
round(res3$r, 2)
##                Unleaded L1_Unleaded L1_Crude_price L1_SP500 L1_PDI L1_CPI
## Unleaded           1.00        0.99           0.97     0.53   0.87   0.89
## L1_Unleaded        0.99        1.00           0.98     0.53   0.88   0.90
## L1_Crude_price     0.97        0.98           1.00     0.48   0.84   0.87
## L1_SP500           0.53        0.53           0.48     1.00   0.69   0.66
## L1_PDI             0.87        0.88           0.84     0.69   1.00   1.00
## L1_CPI             0.89        0.90           0.87     0.66   1.00   1.00
## L1_RR_sales        0.74        0.74           0.69     0.79   0.86   0.83
##                L1_RR_sales
## Unleaded              0.74
## L1_Unleaded           0.74
## L1_Crude_price        0.69
## L1_SP500              0.79
## L1_PDI                0.86
## L1_CPI                0.83
## L1_RR_sales           1.00
round(res3$P, 2)
##                Unleaded L1_Unleaded L1_Crude_price L1_SP500 L1_PDI L1_CPI
## Unleaded             NA           0              0        0      0      0
## L1_Unleaded           0          NA              0        0      0      0
## L1_Crude_price        0           0             NA        0      0      0
## L1_SP500              0           0              0       NA      0      0
## L1_PDI                0           0              0        0     NA      0
## L1_CPI                0           0              0        0      0     NA
## L1_RR_sales           0           0              0        0      0      0
##                L1_RR_sales
## Unleaded                 0
## L1_Unleaded              0
## L1_Crude_price           0
## L1_SP500                 0
## L1_PDI                   0
## L1_CPI                   0
## L1_RR_sales             NA
res4 <- rcorr(as.matrix(my_data4))
round(res4$r, 2)
##               Unleaded L1_Unemp L1_Housing L1_Demand L1_Production
## Unleaded          1.00     0.57      -0.63     -0.52          0.14
## L1_Unemp          0.57     1.00      -0.78     -0.39         -0.01
## L1_Housing       -0.63    -0.78       1.00      0.57         -0.22
## L1_Demand        -0.52    -0.39       0.57      1.00         -0.81
## L1_Production     0.14    -0.01      -0.22     -0.81          1.00
## L1_Imports       -0.03    -0.12       0.32      0.72         -0.69
## L1_Stocks         0.68     0.56      -0.62     -0.59          0.40
##               L1_Imports L1_Stocks
## Unleaded           -0.03      0.68
## L1_Unemp           -0.12      0.56
## L1_Housing          0.32     -0.62
## L1_Demand           0.72     -0.59
## L1_Production      -0.69      0.40
## L1_Imports          1.00     -0.16
## L1_Stocks          -0.16      1.00
round(res4$P, 2)
##               Unleaded L1_Unemp L1_Housing L1_Demand L1_Production
## Unleaded            NA     0.00          0         0          0.04
## L1_Unemp          0.00       NA          0         0          0.83
## L1_Housing        0.00     0.00         NA         0          0.00
## L1_Demand         0.00     0.00          0        NA          0.00
## L1_Production     0.04     0.83          0         0            NA
## L1_Imports        0.65     0.06          0         0          0.00
## L1_Stocks         0.00     0.00          0         0          0.00
##               L1_Imports L1_Stocks
## Unleaded            0.65      0.00
## L1_Unemp            0.06      0.00
## L1_Housing          0.00      0.00
## L1_Demand           0.00      0.00
## L1_Production       0.00      0.00
## L1_Imports            NA      0.01
## L1_Stocks           0.01        NA

1.5 The Multiple Regression Model

  1. Possible multiple regression model 1
mod_mul1 <- lm(formula = Unleaded ~ Crude_price + CPI + Demand, 
              data = Gas_prices)
summary(mod_mul1)
## 
## Call:
## lm(formula = Unleaded ~ Crude_price + CPI + Demand, data = Gas_prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.549 -10.814  -1.954   7.357  65.137 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -48.26392   19.52878  -2.471   0.0142 *  
## Crude_price   2.30659    0.06870  33.575  < 2e-16 ***
## CPI           0.76253    0.09357   8.149 2.14e-14 ***
## Demand       -0.14675    0.14335  -1.024   0.3070    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.58 on 236 degrees of freedom
## Multiple R-squared:  0.9681, Adjusted R-squared:  0.9677 
## F-statistic:  2386 on 3 and 236 DF,  p-value: < 2.2e-16
  1. Possible multiple regression model 2
mod_mul2 <- lm(formula = Unleaded ~ L1_Crude_price + L1_CPI + L1_RR_sales + Stocks, 
              data = Gas_prices)
summary(mod_mul2)
## 
## Call:
## lm(formula = Unleaded ~ L1_Crude_price + L1_CPI + L1_RR_sales + 
##     Stocks, data = Gas_prices)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -68.662  -9.205  -0.846   9.140  59.491 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -9.059e+01  2.666e+01  -3.398 0.000797 ***
## L1_Crude_price  2.406e+00  8.726e-02  27.574  < 2e-16 ***
## L1_CPI          4.017e-01  1.907e-01   2.106 0.036270 *  
## L1_RR_sales     4.139e-04  1.685e-04   2.456 0.014772 *  
## Stocks          2.059e-02  2.297e-02   0.896 0.370981    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.39 on 234 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9608, Adjusted R-squared:  0.9601 
## F-statistic:  1434 on 4 and 234 DF,  p-value: < 2.2e-16

go to top

2 Baseball Salaries

2.1 Import the data

Baseball Salaries Dataset

Baseball <- readxl::read_xlsx(path = "data/Baseball.xlsx")
head(Baseball)
if (!require("forecast")){install.packages("forecast")}

2.2 Scatter plot between Sallary and predictors

library("PerformanceAnalytics")
chart.Correlation(R = Baseball, histogram=TRUE, pch=19)

2.3 Statistical Test of Correlations

if (!require("Hmisc")){install.packages("Hmisc")}
library("Hmisc")
res1 <- rcorr(as.matrix(Baseball))
round(res1$r, digits = 2)
##                 Salary ($000s) Years in Majors Career ERA Innings Pitched
## Salary ($000s)            1.00            0.53      -0.34            0.27
## Years in Majors           0.53            1.00      -0.22            0.11
## Career ERA               -0.34           -0.22       1.00            0.09
## Innings Pitched           0.27            0.11       0.09            1.00
## Career Wins               0.51            0.89      -0.21            0.33
## Career Losses             0.49            0.91      -0.14            0.30
##                 Career Wins Career Losses
## Salary ($000s)         0.51          0.49
## Years in Majors        0.89          0.91
## Career ERA            -0.21         -0.14
## Innings Pitched        0.33          0.30
## Career Wins            1.00          0.97
## Career Losses          0.97          1.00
round(res1$P, 2)
##                 Salary ($000s) Years in Majors Career ERA Innings Pitched
## Salary ($000s)              NA            0.00       0.00            0.00
## Years in Majors              0              NA       0.00            0.16
## Career ERA                   0            0.00         NA            0.25
## Innings Pitched              0            0.16       0.25              NA
## Career Wins                  0            0.00       0.00            0.00
## Career Losses                0            0.00       0.07            0.00
##                 Career Wins Career Losses
## Salary ($000s)            0          0.00
## Years in Majors           0          0.00
## Career ERA                0          0.07
## Innings Pitched           0          0.00
## Career Wins              NA          0.00
## Career Losses             0            NA

2.4 The Multiple Regression Model

  1. Possible multiple regression model 1
mod_mul1 <- lm(formula = `Salary ($000s)` ~ ., 
              data = Baseball)
summary(mod_mul1)
## 
## Call:
## lm(formula = `Salary ($000s)` ~ ., data = Baseball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -957.12 -179.37  -30.94  144.76 1126.91 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        547.3741   167.1927   3.274 0.001285 ** 
## `Years in Majors`   53.6463    13.6215   3.938 0.000120 ***
## `Career ERA`      -152.3702    39.8138  -3.827 0.000182 ***
## `Innings Pitched`    1.7112     0.4207   4.067 7.27e-05 ***
## `Career Wins`       -0.5904     1.8456  -0.320 0.749458    
## `Career Losses`     -1.1234     2.4000  -0.468 0.640322    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 292.8 on 170 degrees of freedom
## Multiple R-squared:  0.3981, Adjusted R-squared:  0.3804 
## F-statistic: 22.49 on 5 and 170 DF,  p-value: < 2.2e-16
qqnorm(mod_mul1$residuals)
qqline(mod_mul1$residuals)

hist(mod_mul1$residuals)

  1. Possible multiple regression model 2
mod_mul2 <- lm(formula = `Salary ($000s)` ~ `Years in Majors` + `Career ERA` + `Innings Pitched`, 
              data = Baseball)
summary(mod_mul2)
## 
## Call:
## lm(formula = `Salary ($000s)` ~ `Years in Majors` + `Career ERA` + 
##     `Innings Pitched`, data = Baseball)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1050.61  -189.86   -25.38   146.24  1154.01 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        620.2179   150.1692   4.130 5.65e-05 ***
## `Years in Majors`   36.8496     5.0231   7.336 8.31e-12 ***
## `Career ERA`      -154.2076    36.8781  -4.182 4.60e-05 ***
## `Innings Pitched`    1.4156     0.3556   3.981 0.000101 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 292.6 on 172 degrees of freedom
## Multiple R-squared:  0.3917, Adjusted R-squared:  0.3811 
## F-statistic: 36.91 on 3 and 172 DF,  p-value: < 2.2e-16
AIC(mod_mul2)
## [1] 2504.412
qqnorm(mod_mul2$residuals)
qqline(mod_mul2$residuals)

hist(mod_mul2$residuals)

plot(mod_mul2$residuals ~ Baseball$`Years in Majors`)

plot(mod_mul2$residuals ~ Baseball$`Salary ($000s)`)

2.5 Regression Diagnostics

par(mfrow=c(1,2))
qqnorm(mod_mul1$residuals, main = "Model 1 Q-Q Plot")
qqline(mod_mul1$residuals)
qqnorm(mod_mul2$residuals, main = "Model 2 Q-Q Plot")
qqline(mod_mul2$residuals)

dev.off()
## null device 
##           1
library(car)
outlierTest(mod_mul2) # Bonferonni p-value for most extreme obs
##      rstudent unadjusted p-value Bonferonni p
## 174  4.148382         5.2665e-05    0.0092691
## 38  -3.879754         1.4901e-04    0.0262260
## 176  3.774558         2.2082e-04    0.0388640
## 175  3.719951         2.6997e-04    0.0475150
qqPlot(mod_mul2, main="QQ Plot") #qq plot for studentized resid

## [1]  38 174
leveragePlots(mod_mul2) # leverage plots

go to top