[R] 1. Multiple Linear Regression
Multiple Linear Regression Practice with R
Functions
# Performance evaluation function for regression
perf_eval_reg <- function(tgt_y, pre_y){
  # RMSE: root mean squared error
  rmse <- sqrt(mean((tgt_y - pre_y)^2))
  # MAE: mean absolute error
  mae <- mean(abs(tgt_y - pre_y))
  # MAPE: mean absolute percentage error
  mape <- 100 * mean(abs((tgt_y - pre_y)/tgt_y))
  return(c(rmse, mae, mape))
}
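# A quick sanity check of the function on toy vectors (illustrative values,
# not taken from the datasets below):
perf_eval_reg(c(100, 200, 300), c(110, 190, 330))
# expected: RMSE ~ 19.15, MAE ~ 16.67, MAPE ~ 8.33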
# Initialize a performance summary
perf_mat <- matrix(0, nrow = 2, ncol = 3)
rownames(perf_mat) <- c("Toyota Corolla", "Boston Housing")
colnames(perf_mat) <- c("RMSE", "MAE", "MAPE")
perf_mat
Dataset 1: Toyota Corolla
corolla <- read.csv("ToyotaCorolla.csv")
View(corolla)
# Indices of the input variables to exclude
id_idx <- c(1, 2) # the `Id` and `Model` columns
# Remove irrelevant columns
corolla_data <- corolla[, -id_idx] # remove `id` and `model` columns
View(corolla_data)
# Check the linearity between the X variables and the Y variable
plot_data <- corolla_data[, -6] # remove the `Fuel_Type` column
corolla_names <- colnames(plot_data)[-1] # remove the `Price` column
par(mfrow = c(5, 7))
for (i in 1:length(corolla_names)){
  plot(Price ~ plot_data[, i+1], data = plot_data, xlab = corolla_names[i])
}
# `Age_08_04` and `KM` are negatively correlated with `Price`
# `HP` and `Weight` show weak positive correlations with `Price`
# `cc` has an outlier
dev.off()
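# The visual impressions above can be quantified with cor(); the column names
# `Age_08_04` and `KM` are assumed to match the standard ToyotaCorolla.csv:
cor(plot_data$Price, plot_data$Age_08_04) # expected to be strongly negative
cor(plot_data$Price, plot_data$KM)        # expected to be negative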
# One outlier exists for the `cc` variable
cc_outlier <- which(plot_data$cc > 15000) # the outlier can be removed based on domain knowledge
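# Inspecting the flagged row (a sketch; in the standard dataset this is a
# single record with cc = 16000, most likely a data-entry error for 1600):
plot_data[cc_outlier, c("Price", "cc")]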
# Select linearly related variables with "Price"
plot_data_selected <- plot_data[-cc_outlier, c(1, 2, 4, 5, 6, 9, 13, 14)]
corolla_names <- colnames(plot_data_selected)[-1]
par(mfrow = c(2, 4))
for (i in 1:length(corolla_names)){
  plot(Price ~ plot_data_selected[, i+1], data = plot_data_selected,
       xlab = corolla_names[i])
} # check later whether these are significant variables :)
dev.off() # remove plots
# Split the data into the training / validation sets
corolla_mlr_data <- corolla_data[-cc_outlier, ] # dataset for the multiple linear regression
nCar <- nrow(corolla_mlr_data)
# Fix the seed for random number generation
set.seed(12345)
corolla_trn_idx <- sample(1:nCar, round(0.7*nCar))
corolla_trn_data <- corolla_mlr_data[corolla_trn_idx, ]
corolla_tst_data <- corolla_mlr_data[-corolla_trn_idx, ]
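# Sanity check: the split should be roughly 70% / 30%
nrow(corolla_trn_data)
nrow(corolla_tst_data)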
# Train the MLR
mlr_corolla <- lm(Price ~ ., data = corolla_trn_data) # `.` uses all other variables as predictors
mlr_corolla
summary(mlr_corolla) # Pr(>|t|) -> look for the three-star (p < 0.001) variables :)
# e.g. `Age_08_04`: each additional month of age lowers the price by about 114 euros
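# A programmatic way to list the three-star (p < 0.001) variables and to read
# off a single coefficient (column name assumed as in the standard CSV):
corolla_coefs <- summary(mlr_corolla)$coefficients
rownames(corolla_coefs)[corolla_coefs[, "Pr(>|t|)"] < 0.001]
coef(mlr_corolla)["Age_08_04"]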
plot(mlr_corolla)
dev.off()
# Plot the result
plot(corolla_trn_data$Price, fitted(mlr_corolla), xlim = c(4000, 35000), ylim = c(4000, 35000))
abline(0, 1, lty=3)
# R^2 and adjusted R^2 in the summary of mlr_corolla are over 0.9 -> a strong linear relationship
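# The values referenced above can be extracted directly from the model summary:
summary(mlr_corolla)$r.squared
summary(mlr_corolla)$adj.r.squared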
# normality test of residuals
corolla_resid <- resid(mlr_corolla)
m <- mean(corolla_resid)
std <- sqrt(var(corolla_resid))
hist(corolla_resid, density = 20, breaks = 50, prob = TRUE,
     xlab = "residual", main = "Normal curve over histogram")
curve(dnorm(x, mean = m, sd = std), col = 'darkblue', lwd = 2, add = TRUE, yaxt = "n")
library(moments) # provides skewness() and kurtosis()
skewness(corolla_resid) # a normal distribution's skewness is 0
kurtosis(corolla_resid) # a normal distribution's kurtosis is 3
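# An alternative visual normality check on the residuals (base R Q-Q plot):
qqnorm(corolla_resid)
qqline(corolla_resid)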
# Performance Measure
mlr_corolla_haty <- predict(mlr_corolla, newdata = corolla_tst_data) # ignore the warning message
perf_mat[1, ] <- perf_eval_reg(corolla_tst_data$Price, mlr_corolla_haty)
perf_mat # MAE is about 804 -> the predicted price differs from the actual price by about 804 euros on average; the MAPE column gives the relative error, roughly 8%
Dataset 2: Boston Housing
boston_housing <- read.csv("BostonHousing.csv")
# Variable Description
# CRIM : per capita crime rate by town
# ZN : proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS : proportion of non-retail business acres per town
# NOX : nitrogen oxides concentration (parts per 10 million)
# RM : average number of rooms per dwelling
# AGE : proportion of owner-occupied units built prior to 1940
# DIS : weighted mean of distances to five Boston employment centres
# TAX : full-value property-tax rate per $10,000
# PTRATIO : pupil-teacher ratio by town
# Black : 1000(Bk - 0.63)^2 where Bk is the proportion of Black residents by town
# LSTAT : lower status of the population (percent)
# MEDV : median value of owner-occupied homes in $1000s
boston_names <- colnames(boston_housing)[-ncol(boston_housing)]
par(mfrow = c(3, 4))
for (i in 1:length(boston_names)){
  plot(MEDV ~ boston_housing[, i], data = boston_housing, xlab = boston_names[i])
}
dev.off()
# It can be seen that quite a few variables show an increasing or decreasing pattern
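# These patterns can be quantified; assuming every column is numeric, the
# correlation of each variable with MEDV is:
sort(cor(boston_housing)[, "MEDV"])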
nHome <- nrow(boston_housing)
nVar <- ncol(boston_housing)
# Split the data into the training/test sets
set.seed(12345)
boston_trn_idx <- sample(1:nHome, round(0.7*nHome))
boston_trn_data <- boston_housing[boston_trn_idx, ]
boston_tst_data <- boston_housing[-boston_trn_idx, ]
# Train the MLR
mlr_boston <- lm(MEDV ~ ., data = boston_trn_data)
summary(mlr_boston)
plot(mlr_boston)
# Plot the result
plot(boston_trn_data$MEDV, fitted(mlr_boston), xlim = c(-5, 50), ylim = c(-5, 50))
abline(0, 1, lty = 3) # the fitted values track the actual values roughly along the 45-degree line
# The regression coefficients estimated by OLS do not fully satisfy the required assumptions here.
# -> The model is not statistically rigorous, but it is still worth trying when prediction accuracy is the goal
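# One of those conditions can be tested formally, e.g. a Shapiro-Wilk test on
# the residuals (base R; a small p-value suggests non-normal residuals):
shapiro.test(resid(mlr_boston))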
# Normality test of residuals
boston_resid <- resid(mlr_boston)
m <- mean(boston_resid)
std <- sqrt(var(boston_resid))
hist(boston_resid, density = 20, breaks = 50, prob = TRUE,
     xlab = "residual", main = "Normal curve over histogram")
curve(dnorm(x, mean = m, sd = std), col = "darkblue", lwd = 2, add = TRUE, yaxt = "n")
skewness(boston_resid) # compare against 0, a normal distribution's skewness
kurtosis(boston_resid) # compare against 3, a normal distribution's kurtosis
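# Performance measure, mirroring the Corolla steps above; this fills in the
# second row of perf_mat that was initialized at the start:
mlr_boston_haty <- predict(mlr_boston, newdata = boston_tst_data)
perf_mat[2, ] <- perf_eval_reg(boston_tst_data$MEDV, mlr_boston_haty)
perf_mat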