Give me your reasons.
Why?
Gas_prices <- readxl::read_xlsx(path = "data/Gas_prices_1.xlsx")
colnames(Gas_prices)
[1] “Date” “Year” “Month” “Unleaded”
[5] “Crude_price” “SP500” “PDI” “CPI”
[9] “RR_sales” “Unemp” “Housing” “Demand”
[13] “Production” “Imports” “Stocks” “Time”
[17] “L1_Unleaded” “L1_Crude_price” “L1_SP500” “L1_PDI”
[21] “L1_CPI” “L1_RR_sales” “L1_Unemp” “L1_Housing”
[25] “L1_Demand” “L1_Production” “L1_Imports” “L1_Stocks”
[29] “Recession”
if (!require("forecast")){install.packages("forecast")}
library("PerformanceAnalytics")
my_data1 <- Gas_prices[,-c(1:3,10:29)]
chart.Correlation(R = my_data1, histogram=TRUE, pch=19)
my_data2 <- Gas_prices[,-c(1:3,5:9, 16:29)]
chart.Correlation(R = my_data2, histogram=TRUE, pch=19)
my_data3 <- Gas_prices[,-c(1:3,5:16,23:29)]
chart.Correlation(R = my_data3, histogram=TRUE, pch=19)
my_data4 <- Gas_prices[,-c(1:3,5:22, 29)]
chart.Correlation(R = my_data4, histogram=TRUE, pch=19)
if (!require("Hmisc")){install.packages("Hmisc")}
library("Hmisc")
res1 <- rcorr(as.matrix(my_data1))
round(res1$r, digits = 2)
## Unleaded Crude_price SP500 PDI CPI RR_sales
## Unleaded 1.00 0.98 0.53 0.87 0.90 0.73
## Crude_price 0.98 1.00 0.47 0.83 0.86 0.68
## SP500 0.53 0.47 1.00 0.69 0.67 0.79
## PDI 0.87 0.83 0.69 1.00 1.00 0.86
## CPI 0.90 0.86 0.67 1.00 1.00 0.83
## RR_sales 0.73 0.68 0.79 0.86 0.83 1.00
round(res1$P, 2)
## Unleaded Crude_price SP500 PDI CPI RR_sales
## Unleaded NA 0 0 0 0 0
## Crude_price 0 NA 0 0 0 0
## SP500 0 0 NA 0 0 0
## PDI 0 0 0 NA 0 0
## CPI 0 0 0 0 NA 0
## RR_sales 0 0 0 0 0 NA
res2 <- rcorr(as.matrix(my_data2))
round(res2$r, 2)
## Unleaded Unemp Housing Demand Production Imports Stocks
## Unleaded 1.00 0.56 -0.64 -0.53 0.15 -0.03 0.68
## Unemp 0.56 1.00 -0.78 -0.39 -0.02 -0.12 0.54
## Housing -0.64 -0.78 1.00 0.56 -0.22 0.32 -0.61
## Demand -0.53 -0.39 0.56 1.00 -0.82 0.72 -0.60
## Production 0.15 -0.02 -0.22 -0.82 1.00 -0.69 0.42
## Imports -0.03 -0.12 0.32 0.72 -0.69 1.00 -0.17
## Stocks 0.68 0.54 -0.61 -0.60 0.42 -0.17 1.00
round(res2$P, 2)
## Unleaded Unemp Housing Demand Production Imports Stocks
## Unleaded NA 0.00 0 0 0.02 0.61 0.00
## Unemp 0.00 NA 0 0 0.76 0.07 0.00
## Housing 0.00 0.00 NA 0 0.00 0.00 0.00
## Demand 0.00 0.00 0 NA 0.00 0.00 0.00
## Production 0.02 0.76 0 0 NA 0.00 0.00
## Imports 0.61 0.07 0 0 0.00 NA 0.01
## Stocks 0.00 0.00 0 0 0.00 0.01 NA
res3 <- rcorr(as.matrix(my_data3))
round(res3$r, 2)
## Unleaded L1_Unleaded L1_Crude_price L1_SP500 L1_PDI L1_CPI
## Unleaded 1.00 0.99 0.97 0.53 0.87 0.89
## L1_Unleaded 0.99 1.00 0.98 0.53 0.88 0.90
## L1_Crude_price 0.97 0.98 1.00 0.48 0.84 0.87
## L1_SP500 0.53 0.53 0.48 1.00 0.69 0.66
## L1_PDI 0.87 0.88 0.84 0.69 1.00 1.00
## L1_CPI 0.89 0.90 0.87 0.66 1.00 1.00
## L1_RR_sales 0.74 0.74 0.69 0.79 0.86 0.83
## L1_RR_sales
## Unleaded 0.74
## L1_Unleaded 0.74
## L1_Crude_price 0.69
## L1_SP500 0.79
## L1_PDI 0.86
## L1_CPI 0.83
## L1_RR_sales 1.00
round(res3$P, 2)
## Unleaded L1_Unleaded L1_Crude_price L1_SP500 L1_PDI L1_CPI
## Unleaded NA 0 0 0 0 0
## L1_Unleaded 0 NA 0 0 0 0
## L1_Crude_price 0 0 NA 0 0 0
## L1_SP500 0 0 0 NA 0 0
## L1_PDI 0 0 0 0 NA 0
## L1_CPI 0 0 0 0 0 NA
## L1_RR_sales 0 0 0 0 0 0
## L1_RR_sales
## Unleaded 0
## L1_Unleaded 0
## L1_Crude_price 0
## L1_SP500 0
## L1_PDI 0
## L1_CPI 0
## L1_RR_sales NA
res4 <- rcorr(as.matrix(my_data4))
round(res4$r, 2)
## Unleaded L1_Unemp L1_Housing L1_Demand L1_Production
## Unleaded 1.00 0.57 -0.63 -0.52 0.14
## L1_Unemp 0.57 1.00 -0.78 -0.39 -0.01
## L1_Housing -0.63 -0.78 1.00 0.57 -0.22
## L1_Demand -0.52 -0.39 0.57 1.00 -0.81
## L1_Production 0.14 -0.01 -0.22 -0.81 1.00
## L1_Imports -0.03 -0.12 0.32 0.72 -0.69
## L1_Stocks 0.68 0.56 -0.62 -0.59 0.40
## L1_Imports L1_Stocks
## Unleaded -0.03 0.68
## L1_Unemp -0.12 0.56
## L1_Housing 0.32 -0.62
## L1_Demand 0.72 -0.59
## L1_Production -0.69 0.40
## L1_Imports 1.00 -0.16
## L1_Stocks -0.16 1.00
round(res4$P, 2)
## Unleaded L1_Unemp L1_Housing L1_Demand L1_Production
## Unleaded NA 0.00 0 0 0.04
## L1_Unemp 0.00 NA 0 0 0.83
## L1_Housing 0.00 0.00 NA 0 0.00
## L1_Demand 0.00 0.00 0 NA 0.00
## L1_Production 0.04 0.83 0 0 NA
## L1_Imports 0.65 0.06 0 0 0.00
## L1_Stocks 0.00 0.00 0 0 0.00
## L1_Imports L1_Stocks
## Unleaded 0.65 0.00
## L1_Unemp 0.06 0.00
## L1_Housing 0.00 0.00
## L1_Demand 0.00 0.00
## L1_Production 0.00 0.00
## L1_Imports NA 0.01
## L1_Stocks 0.01 NA
mod_mul1 <- lm(formula = Unleaded ~ Crude_price + CPI + Demand,
data = Gas_prices)
summary(mod_mul1)
##
## Call:
## lm(formula = Unleaded ~ Crude_price + CPI + Demand, data = Gas_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.549 -10.814 -1.954 7.357 65.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -48.26392 19.52878 -2.471 0.0142 *
## Crude_price 2.30659 0.06870 33.575 < 2e-16 ***
## CPI 0.76253 0.09357 8.149 2.14e-14 ***
## Demand -0.14675 0.14335 -1.024 0.3070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.58 on 236 degrees of freedom
## Multiple R-squared: 0.9681, Adjusted R-squared: 0.9677
## F-statistic: 2386 on 3 and 236 DF, p-value: < 2.2e-16
mod_mul2 <- lm(formula = Unleaded ~ L1_Crude_price + L1_CPI + L1_RR_sales + Stocks,
data = Gas_prices)
summary(mod_mul2)
##
## Call:
## lm(formula = Unleaded ~ L1_Crude_price + L1_CPI + L1_RR_sales +
## Stocks, data = Gas_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68.662 -9.205 -0.846 9.140 59.491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.059e+01 2.666e+01 -3.398 0.000797 ***
## L1_Crude_price 2.406e+00 8.726e-02 27.574 < 2e-16 ***
## L1_CPI 4.017e-01 1.907e-01 2.106 0.036270 *
## L1_RR_sales 4.139e-04 1.685e-04 2.456 0.014772 *
## Stocks 2.059e-02 2.297e-02 0.896 0.370981
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.39 on 234 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9608, Adjusted R-squared: 0.9601
## F-statistic: 1434 on 4 and 234 DF, p-value: < 2.2e-16
Baseball <- readxl::read_xlsx(path = "data/Baseball.xlsx")
head(Baseball)
if (!require("forecast")){install.packages("forecast")}
library("PerformanceAnalytics")
chart.Correlation(R = Baseball, histogram=TRUE, pch=19)
if (!require("Hmisc")){install.packages("Hmisc")}
library("Hmisc")
res1 <- rcorr(as.matrix(Baseball))
round(res1$r, digits = 2)
## Salary ($000s) Years in Majors Career ERA Innings Pitched
## Salary ($000s) 1.00 0.53 -0.34 0.27
## Years in Majors 0.53 1.00 -0.22 0.11
## Career ERA -0.34 -0.22 1.00 0.09
## Innings Pitched 0.27 0.11 0.09 1.00
## Career Wins 0.51 0.89 -0.21 0.33
## Career Losses 0.49 0.91 -0.14 0.30
## Career Wins Career Losses
## Salary ($000s) 0.51 0.49
## Years in Majors 0.89 0.91
## Career ERA -0.21 -0.14
## Innings Pitched 0.33 0.30
## Career Wins 1.00 0.97
## Career Losses 0.97 1.00
round(res1$P, 2)
## Salary ($000s) Years in Majors Career ERA Innings Pitched
## Salary ($000s) NA 0.00 0.00 0.00
## Years in Majors 0 NA 0.00 0.16
## Career ERA 0 0.00 NA 0.25
## Innings Pitched 0 0.16 0.25 NA
## Career Wins 0 0.00 0.00 0.00
## Career Losses 0 0.00 0.07 0.00
## Career Wins Career Losses
## Salary ($000s) 0 0.00
## Years in Majors 0 0.00
## Career ERA 0 0.07
## Innings Pitched 0 0.00
## Career Wins NA 0.00
## Career Losses 0 NA
mod_mul1 <- lm(formula = `Salary ($000s)` ~ .,
data = Baseball)
summary(mod_mul1)
##
## Call:
## lm(formula = `Salary ($000s)` ~ ., data = Baseball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -957.12 -179.37 -30.94 144.76 1126.91
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 547.3741 167.1927 3.274 0.001285 **
## `Years in Majors` 53.6463 13.6215 3.938 0.000120 ***
## `Career ERA` -152.3702 39.8138 -3.827 0.000182 ***
## `Innings Pitched` 1.7112 0.4207 4.067 7.27e-05 ***
## `Career Wins` -0.5904 1.8456 -0.320 0.749458
## `Career Losses` -1.1234 2.4000 -0.468 0.640322
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 292.8 on 170 degrees of freedom
## Multiple R-squared: 0.3981, Adjusted R-squared: 0.3804
## F-statistic: 22.49 on 5 and 170 DF, p-value: < 2.2e-16
qqnorm(mod_mul1$residuals)
qqline(mod_mul1$residuals)
hist(mod_mul1$residuals)
mod_mul2 <- lm(formula = `Salary ($000s)` ~ `Years in Majors` + `Career ERA` + `Innings Pitched`,
data = Baseball)
summary(mod_mul2)
##
## Call:
## lm(formula = `Salary ($000s)` ~ `Years in Majors` + `Career ERA` +
## `Innings Pitched`, data = Baseball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1050.61 -189.86 -25.38 146.24 1154.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 620.2179 150.1692 4.130 5.65e-05 ***
## `Years in Majors` 36.8496 5.0231 7.336 8.31e-12 ***
## `Career ERA` -154.2076 36.8781 -4.182 4.60e-05 ***
## `Innings Pitched` 1.4156 0.3556 3.981 0.000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 292.6 on 172 degrees of freedom
## Multiple R-squared: 0.3917, Adjusted R-squared: 0.3811
## F-statistic: 36.91 on 3 and 172 DF, p-value: < 2.2e-16
AIC(mod_mul2)
## [1] 2504.412
qqnorm(mod_mul2$residuals)
qqline(mod_mul2$residuals)
hist(mod_mul2$residuals)
plot(mod_mul2$residuals ~ Baseball$`Years in Majors`)
plot(mod_mul2$residuals ~ Baseball$`Salary ($000s)`)
par(mfrow=c(1,2))
qqnorm(mod_mul1$residuals, main = "Model 1 Q-Q Plot")
qqline(mod_mul1$residuals)
qqnorm(mod_mul2$residuals, main = "Model 2 Q-Q Plot")
qqline(mod_mul2$residuals)
dev.off()
## null device
## 1
library(car)
outlierTest(mod_mul2) # Bonferonni p-value for most extreme obs
## rstudent unadjusted p-value Bonferonni p
## 174 4.148382 5.2665e-05 0.0092691
## 38 -3.879754 1.4901e-04 0.0262260
## 176 3.774558 2.2082e-04 0.0388640
## 175 3.719951 2.6997e-04 0.0475150
qqPlot(mod_mul2, main="QQ Plot") #qq plot for studentized resid
## [1] 38 174
leveragePlots(mod_mul2) # leverage plots