[R] 1. Multiple Linear Regression


Multiple Linear Regression Practice with R

Functions

# Performance evaluation function for regression
perf_eval_reg <- function(tgt_y, pre_y){
  # RMSE: root mean squared error
  rmse <- sqrt(mean((tgt_y - pre_y)^2))
  # MAE: mean absolute error
  mae <- mean(abs(tgt_y - pre_y))
  # MAPE: mean absolute percentage error
  mape <- 100 * mean(abs((tgt_y - pre_y)/tgt_y))
  
  return(c(rmse, mae, mape))
}
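A quick sanity check on made-up toy vectors shows the return order (the numbers below are purely illustrative):

# Example call: returns c(RMSE, MAE, MAPE) in that order
perf_eval_reg(c(100, 200, 300), c(110, 190, 330))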

# Initialize a performance summary 
perf_mat <- matrix(0, nrow = 2, ncol = 3)
rownames(perf_mat) <- c("Toyota Corolla", "Boston Housing")
colnames(perf_mat) <- c("RMSE", "MAE", "MAPE")
perf_mat

Dataset 1: Toyota Corolla

corolla <- read.csv("ToyotaCorolla.csv")
View(corolla)

# Indices of the columns to exclude from the inputs
id_idx <- c(1, 2) # positions of the `id` and `model` columns

# Remove irrelevant columns
corolla_data <- corolla[, -id_idx] # remove `id` and `model` columns
View(corolla_data)

# Check the linearity between the X variables and the Y variable
plot_data <- corolla_data[, -6] # remove the `Fuel_Type` column (categorical)
corolla_names <- colnames(plot_data)[-1] # drop `Price`, keep the predictor names

par(mfrow = c(5, 7))
for (i in 1:length(corolla_names)){
  plot(Price ~ plot_data[, i+1], data = plot_data, xlab = corolla_names[i])
} 
# `Age_08_04` and `Km` are negatively correlated with `Price`
# `HP` and `Weight` show weak positive correlations with `Price`
# `cc` has an outlier
dev.off()
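The patterns read off the scatter plots can be cross-checked numerically. A minimal sketch, assuming every column of plot_data is numeric (as in this CSV):

# Numeric cross-check of the eyeballed relationships
# (constant columns, if any, show up as NA and are dropped by sort())
sort(cor(plot_data)[, "Price"])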

# One outlier exists for the `cc` variable
cc_outlier <- which(plot_data$cc > 15000) # domain knowledge justifies removing this observation
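Before dropping it, it is worth looking at the flagged row:

# Inspect the flagged observation; its `cc` value is implausibly large
plot_data[cc_outlier, c("Price", "cc")]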

# Select linearly related variables with "Price"
plot_data_selected <- plot_data[-cc_outlier, c(1, 2, 4, 5, 6, 9, 13, 14)]
corolla_names <- colnames(plot_data_selected)[-1]

par(mfrow = c(2, 4))
for (i in 1:length(corolla_names)){
  plot(Price ~ plot_data_selected[, i+1], data = plot_data_selected, 
       xlab = corolla_names[i])
} # check later whether these variables turn out to be significant :)
dev.off() # remove plots

# Split the data into the training/test sets
corolla_mlr_data <- corolla_data[-cc_outlier, ] # dataset for the multiple linear regression
nCar <- nrow(corolla_mlr_data)

# Fix the seed for random number generation
set.seed(12345)
corolla_trn_idx <- sample(1:nCar, round(0.7*nCar))
corolla_trn_data <- corolla_mlr_data[corolla_trn_idx, ]
corolla_tst_data <- corolla_mlr_data[-corolla_trn_idx, ]

# Train the MLR 
mlr_corolla <- lm(Price ~ ., data = corolla_trn_data) # "." uses all remaining columns as predictors
mlr_corolla
summary(mlr_corolla) # Pr(>|t|) -> check the variables with three stars :)
                     # e.g. `Age_08_04`: each additional month of age lowers the price by about 114 euros
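Rather than scanning the printout for stars, one option is to filter the coefficient table that summary() returns:

# Keep only the coefficients significant at the 0.1% level (three stars)
mlr_coefs <- summary(mlr_corolla)$coefficients
mlr_coefs[mlr_coefs[, "Pr(>|t|)"] < 0.001, ]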
plot(mlr_corolla)
dev.off()
# Plot the result
plot(corolla_trn_data$Price, fitted(mlr_corolla), xlim = c(4000, 35000), ylim = c(4000, 35000))
abline(0, 1, lty=3)
  # R^2 and adjusted R^2 in the summary of mlr_corolla are over 0.9, indicating a strong linear fit

# Normality test of residuals
corolla_resid <- resid(mlr_corolla)

m <- mean(corolla_resid)
std <- sqrt(var(corolla_resid))

hist(corolla_resid, density = 20, breaks = 50, prob = TRUE, xlab = "residual", main = "Normal curve over histogram")
curve(dnorm(x, mean = m, sd = std), col = 'darkblue', lwd = 2, add = TRUE, yaxt = "n")

library(moments) # skewness() and kurtosis() below come from the moments package
skewness(corolla_resid) # a normal distribution's skewness is 0
kurtosis(corolla_resid) # a normal distribution's kurtosis is 3 (the moments convention)
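A formal test can back up the visual and moment-based checks. Base R's Shapiro-Wilk test works here since the training set has fewer than 5,000 rows:

# Shapiro-Wilk normality test; a small p-value means the residuals deviate from normality
shapiro.test(corolla_resid)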

# Performance Measure
mlr_corolla_haty <- predict(mlr_corolla, newdata = corolla_tst_data) # ignore the warning message 

perf_mat[1, ] <- perf_eval_reg(corolla_tst_data$Price, mlr_corolla_haty)
perf_mat # MAE is 804 -> on average the predicted price deviates from the actual price by 804 euros, an error rate of about 8%

Dataset 2: Boston Housing

boston_housing <- read.csv("BostonHousing.csv")

# Variable Description
# CRIM : per capita crime rate by town
# ZN : proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS : proportion of non-retail business acres per town
# NOX : nitrogen oxides concentration (parts per 10 million)
# RM : average number of rooms per dwelling
# AGE : proportion of owner-occupied units built prior to 1940
# DIS : weighted mean of distances to five Boston employment centres
# TAX : full-value property-tax rate per $10,000
# PTRATIO : pupil-teacher ratio by town
# Black : 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT : lower status of the population (percent)
# MEDV : median value of owner-occupied homes in $1000s
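Assuming the CSV follows this standard layout, a quick structure check confirms the column names and types:

# Confirm the columns match the description above
str(boston_housing)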

boston_names <- colnames(boston_housing)[-ncol(boston_housing)]

par(mfrow = c(3, 4))
for (i in 1:length(boston_names)){
  plot(MEDV ~ boston_housing[, i], data = boston_housing, xlab = boston_names[i])
}
dev.off()
# It can be seen that quite a few variables show an increasing or decreasing pattern
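As with the Corolla data, the impression from the plots can be quantified. All columns here are numeric, so cor() applies directly:

# Correlation of every variable with MEDV, strongest positive first
sort(cor(boston_housing)[, "MEDV"], decreasing = TRUE)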

nHome <- nrow(boston_housing)
nVar <- ncol(boston_housing)

# Split the data into the training/test sets
set.seed(12345)
boston_trn_idx <- sample(1:nHome, round(0.7*nHome))
boston_trn_data <- boston_housing[boston_trn_idx, ]
boston_tst_data <- boston_housing[-boston_trn_idx, ]

# Train the MLR
mlr_boston <- lm(MEDV ~ ., data = boston_trn_data)
summary(mlr_boston)
plot(mlr_boston)

# Plot the result 
plot(boston_trn_data$MEDV, fitted(mlr_boston), xlim = c(-5, 50), ylim = c(-5, 50))
abline(0, 1, lty = 3) # the residuals show a systematic (curved) pattern against the fitted values
                      # the OLS estimates therefore violate the assumptions they rely on
                      # -> not statistically sound, but still worth trying if predictive accuracy is the goal
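To see that pattern directly, plot the residuals against the fitted values (this is also the first panel produced by plot(mlr_boston)):

# Residuals vs. fitted values; a curved band instead of a random cloud
# signals a violated linearity assumption
plot(fitted(mlr_boston), resid(mlr_boston), xlab = "fitted values", ylab = "residuals")
abline(h = 0, lty = 3)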

# Normality test of residuals 
boston_resid <- resid(mlr_boston)

m <- mean(boston_resid)
std <- sqrt(var(boston_resid))

hist(boston_resid, density = 20, breaks = 50, prob = TRUE, 
     xlab = "residual", main = "Normal curve over histogram")

curve(dnorm(x, mean = m, sd = std), col = "darkblue", lwd = 2, add = TRUE, yaxt = "n")

skewness(boston_resid) # again, 0 for a perfect normal distribution
kurtosis(boston_resid) # again, 3 for a perfect normal distribution
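The performance matrix initialized at the top still has an empty second row; filling it with the same evaluation makes the two datasets comparable:

# Performance Measure
mlr_boston_haty <- predict(mlr_boston, newdata = boston_tst_data)
perf_mat[2, ] <- perf_eval_reg(boston_tst_data$MEDV, mlr_boston_haty)
perf_mat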
