# Load the Toyota Corolla data set from the current working directory.
corolla <- read.csv("ToyotaCorolla.csv")
# View() opens a spreadsheet-style viewer and needs a GUI, so it is only
# called in an interactive session; batch runs (Rscript) skip it.
if (interactive()) {
  View(corolla)
}

# Positions of the columns that are excluded from the analysis
# (described by the original author as the `id` and `model` columns).
id_idx <- c(1, 2)

# Remove the irrelevant columns by negative positional indexing.
corolla_data <- corolla[, -id_idx]
if (interactive()) {
  View(corolla_data)
}
# Check the linearity between each X variable and the Y variable (Price).
# Column 6 is dropped first (the `Fuel_Type` column per the original author).
plot_data <- corolla_data[, -6]
# Predictor names: every column of plot_data except the first (`Price`).
corolla_names <- colnames(plot_data)[-1]
# Lay the scatter plots out on a 5 x 7 grid of panels.
par(mfrow = c(5, 7))
# seq_along() gives an empty sequence when there are no predictors, whereas
# 1:length(x) would iterate over c(1, 0) and index a non-existent column.
for (i in seq_along(corolla_names)) {
  # Column i + 1 of plot_data is the predictor named corolla_names[i].
  plot(Price ~ plot_data[, i + 1], data = plot_data, xlab = corolla_names[i])
}
# Original author's reading of the plots:
# - `Age_08_04` and `Km` appear negatively correlated with Price
# - `HP` and `Weight` show weak positive correlations with Price
# - `cc` has an outlier
# Close the graphics device, which also discards the par(mfrow) layout.
dev.off()
# One outlier exists for the cc variable (per the scatter plots above).
# The 15000 threshold is the original author's choice; the outlier could
# also be justified and removed using domain knowledge.
cc_outlier <- which(plot_data$cc > 15000)
# Build the rows to keep explicitly. Negative indexing with an empty vector
# (`x[-integer(0), ]`) selects ZERO rows, so `plot_data[-cc_outlier, ]` would
# silently drop the whole data set whenever no row exceeds the threshold.
cc_keep_rows <- setdiff(seq_len(nrow(plot_data)), cc_outlier)
# Select the variables that look linearly related to "Price"
# (column 1 is Price itself; the rest are chosen by position).
plot_data_selected <- plot_data[cc_keep_rows, c(1, 2, 4, 5, 6, 9, 13, 14)]
corolla_names <- colnames(plot_data_selected)[-1]
# Lay the scatter plots out on a 2 x 4 grid of panels.
par(mfrow = c(2, 4))
# seq_along() is safe for an empty name vector, unlike 1:length(x).
for (i in seq_along(corolla_names)) {
  plot(Price ~ plot_data_selected[, i + 1], data = plot_data_selected,
       xlab = corolla_names[i])
}
# Whether these variables are statistically significant is checked later
# with the fitted regression model.
dev.off() # close the graphics device (removes the plots)
# Split the data into the training / validation sets.
# Build the data set for the multiple linear regression by dropping the cc
# outlier rows. The keep-index is computed explicitly because negative
# indexing with an empty vector (`x[-integer(0), ]`) selects ZERO rows, which
# would empty the data set if `cc_outlier` contains no row numbers.
corolla_mlr_data <- corolla_data[
  setdiff(seq_len(nrow(corolla_data)), cc_outlier),
]
nCar <- nrow(corolla_mlr_data)
# Fix the seed for random number generation so the split is reproducible.
set.seed(12345)
# Draw 70% of the row numbers (without replacement) for training;
# sample.int(n, k) draws the same values as sample(1:n, k).
corolla_trn_idx <- sample.int(nCar, round(0.7 * nCar))
corolla_trn_data <- corolla_mlr_data[corolla_trn_idx, ]
# The remaining rows form the held-out set.
corolla_tst_data <- corolla_mlr_data[-corolla_trn_idx, ]
# Fit the multiple linear regression (MLR) on the training set.
# In the formula, `.` stands for every column of the data other than Price.
mlr_corolla <- lm(formula = Price ~ ., data = corolla_trn_data)
mlr_corolla
# In the coefficient table, look at Pr(>|t|): variables marked with three
# stars are the most significant.
summary(mlr_corolla)
# Example reading (original author's figures): for `Age_08_04`, each
# additional month of age lowers the price by about 114 euros.

# Standard lm diagnostic plots, then close the graphics device.
plot(mlr_corolla)
dev.off()

# Plot actual vs. fitted prices on identical axis ranges; the dotted
# 45-degree line marks a perfect fit.
plot(
  corolla_trn_data$Price,
  fitted(mlr_corolla),
  xlim = c(4000, 35000),
  ylim = c(4000, 35000)
)
abline(a = 0, b = 1, lty = 3)
# Per the original author, R^2 and adjusted R^2 in the model summary are
# above 0.9, which indicates a strong linear relation.
# Normality check of the residuals.
corolla_resid <- residuals(mlr_corolla)
# Sample mean and standard deviation of the residuals, used to parameterise
# the reference normal curve below.
m <- mean(corolla_resid)
std <- sd(corolla_resid)
# Density-scaled histogram of the residuals with shaded bars ...
hist(
  corolla_resid,
  breaks = 50,
  density = 20,
  probability = TRUE,
  xlab = "x-variable",
  main = "normal curve over histogram"
)
# ... overlaid with the normal density that has the same mean and sd.
curve(
  dnorm(x, mean = m, sd = std),
  add = TRUE,
  col = 'darkblue',
  lwd = 2,
  yaxt = "n"
)
# NOTE(review): skewness() and kurtosis() are not base R functions and no
# library() call is visible here; presumably a package such as `moments`
# is loaded elsewhere -- TODO confirm.
skewness(corolla_resid) # a normal distribution has skewness 0
kurtosis(corolla_resid) # a normal distribution has kurtosis 3
# Performance measure on the held-out set.
# Predict prices for the rows that were not used for training.
# NOTE(review): the original author says to ignore the warning message
# emitted by this call; the cause is not visible here -- confirm.
mlr_corolla_haty <- predict(object = mlr_corolla, newdata = corolla_tst_data)
# Store the metrics in the first row of perf_mat. Both perf_mat and
# perf_eval_reg() are defined outside this part of the script; the call
# passes the actual prices first and the predictions second.
perf_mat[1, ] <- perf_eval_reg(
  corolla_tst_data[["Price"]],
  mlr_corolla_haty
)
# Original author's reading of the result: MAE is 804, i.e. about an
# 804 euro difference between actual and predicted price (error rate 8.04%).
perf_mat
# (Blog page footer text, not part of the analysis: "Leave a comment")