[R] 2. Logistic Regression with Confusion Matrix
3 분 소요
Logistic Regression Practice with R
# Compute binary-classification metrics from a 2x2 confusion matrix.
#
# Args:
#   cm: 2x2 confusion matrix (matrix or table) indexed as cm[actual, predicted].
#       Index 1 is treated as the negative class and index 2 as the positive
#       class.
#
# Returns:
#   Unnamed numeric vector of length 6, in the order
#   TPR (recall), precision, TNR, accuracy, BCR, F1.
perf_eval1 <- function(cm){
  # True positive rate : TPR, recall
  TPR <- cm[2, 2]/sum(cm[2, ])
  # Precision
  PRE <- cm[2, 2]/sum(cm[, 2])
  # True negative rate : TNR
  TNR <- cm[1, 1]/sum(cm[1, ])
  # Accuracy
  ACC <- (cm[1, 1]+cm[2, 2])/sum(cm)
  # Balanced Correction Rate: geometric mean of TPR and TNR
  BCR <- sqrt(TPR*TNR)
  # F-1 Measure: harmonic mean of recall and precision
  # (NaN when both recall and precision are zero)
  F1 <- 2*TPR*PRE/(TPR+PRE)
  return(c(TPR, PRE, TNR, ACC, BCR, F1))
}
# Build the 1 x 6 results matrix in one step: a single zero-filled row for the
# logistic regression model, one column per performance metric.
perf_mat <- matrix(
  0,
  nrow = 1,
  ncol = 6,
  dimnames = list(
    "Logistic Regression",
    c("TPR (Recall)", "Precision", "TNR", "ACC", "BCR", "F1")
  )
)
Dataset 1. Personal Loan
# Load Dataset
ploan <- read.csv("Personal Loan.csv")
# Columns 1, 5 and 10 are excluded from the inputs. Per the original author's
# note, 1 and 5 are unique numbers (identifiers) and 10 is the target variable.
input_idx <- c(2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14)
target_idx <- 10
# Conduct the normalization: centre and scale every input column so that
# variables measured on different scales become comparable.
ploan_input <- ploan[, input_idx]
ploan_input <- scale(ploan_input, center = TRUE, scale = TRUE)
ploan_target <- ploan[, target_idx]
ploan_data <- data.frame(ploan_input, ploan_target)
# Split the data into the training (70%) / test (30%) datasets.
# seq_len() keeps the index vector empty for a zero-row data frame, whereas
# 1:nrow() would yield c(1, 0).
# NOTE(review): no set.seed() call is visible here, so the split (and every
# metric computed from it) differs between runs.
trn_idx <- sample(seq_len(nrow(ploan_data)), round(0.7*nrow(ploan_data)))
ploan_trn <- ploan_data[trn_idx, ]
ploan_tst <- ploan_data[-trn_idx, ]
# Train the Logistic Regression Model with all variables
# family = binomial -> Logistic Regression
full_lr <- glm(ploan_target ~ ., family = binomial, data = ploan_trn)
- 신용대출 상품 이용 여부에 대해서 Age, Experience, Mortgage, Online은 중요한 변수가 아님
- Income의 회귀 계수 = 2.28517 → 양수이기 때문에 Income이 증가할수록 신용 대출을 이용할 확률이 높아짐 → 봉급이 많을수록 돈을 갚을 능력이 있다는 것을 의미
- CreditCard의 회귀 계수 = 0.56684 → 신용카드를 보유하고 있으면 대출을 사용할 가능성이 낮아짐 → 지금 당장 현금이 없어도 카드로 결제가 가능하기 때문에 대출을 받을 가능성이 낮아짐
# Predict on the test set; type = "response" returns the fitted probability
# for each row of ploan_tst instead of the linear predictor.
lr_response <- predict(full_lr, type = "response", newdata = ploan_tst)
lr_target <- ploan_tst$ploan_target
# Turn probabilities into 0/1 labels with a 0.5 cut-off
lr_predicted <- rep(0, length(lr_target)) # Initialization
lr_predicted[which(lr_response >= 0.5)] <- 1
# Make a Confusion Matrix (rows = actual, columns = predicted).
# The predicted labels are converted to a factor with both levels pinned, so
# the table keeps a column for a label that is never predicted; otherwise
# perf_eval1() would fail on cm[2, 2].
cm_full <- table(lr_target, lr_predicted = factor(lr_predicted, levels = c(0, 1)))
perf_mat[1, ] <- perf_eval1(cm_full)
- TPR(recall) : 모델에 의해서 실제 대출을 한 고객들 중 64%가 정확하게 예측되었다.
- Precision : 대출을 할 것이라고 예측한 고객들 중 84%가 실제로 대출을 했다.
- 단순히 정확도(Accuracy)는 높지만, 균형 정확도(BCR), F1 지표는 비교적 낮은 것을 볼 수 있다. → 신용대출을 한 고객들보다 안 한 고객들이 더 많기 때문에 일어난 현상
# Compute overall accuracy and balanced correction rate for a multi-class
# confusion matrix.
#
# Args:
#   cm: square confusion matrix (matrix or table) indexed as
#       cm[actual, predicted]. Written for three classes, but any number of
#       classes is accepted.
#
# Returns:
#   Unnamed numeric vector c(ACC, BCR), where BCR is the geometric mean of the
#   per-class accuracies (correct predictions for a class / row total).
perf_eval3 <- function(cm){
  stopifnot(nrow(cm) == ncol(cm))
  # simple accuracy
  ACC <- sum(diag(cm))/sum(cm)
  # ACC for each class
  class_acc <- diag(cm)/rowSums(cm)
  # Geometric mean over all classes; equals (A1*A2*A3)^(1/3) for a 3x3 input
  BCR <- prod(class_acc)^(1/length(class_acc))
  return(c(ACC, BCR))
}
Dataset 2. Wine dataset from UCI
wine <- read.csv("wine.csv")
# Define the baseline class (int to factor)
wine$Class <- as.factor(wine$Class)
wine$Class <- relevel(wine$Class, ref = "3") # Define reference category as number 3
# Map each observation to a fill colour by its class. Passing the bare
# three-colour vector to `bg` would recycle it by row position, so the colours
# would not reflect class membership. Colours follow the factor level order
# (the reference level "3" comes first after relevel()).
class_bg <- c("red", "green3", "blue")[as.integer(wine$Class)]
# Pairwise scatter plots in three groups of columns.
# NOTE(review): assumes Class is column 1 and columns 2-14 are predictors -
# confirm against wine.csv.
pairs(wine[, 2:6], main = "scatter plot for each class", pch = 21, bg = class_bg, cex = 1)
pairs(wine[, 7:11], main = "scatter plot for each class", pch = 21, bg = class_bg, cex = 1)
pairs(wine[, 12:14], main = "scatter plot for each class", pch = 21, bg = class_bg, cex = 1)
- 그림상으로는 분류하기 어려워 보인다 :( 확인해보자 !
# multinom() is provided by the nnet package; it is loaded here because no
# library() call is visible in this section.
library(nnet)
# Data Split: 70% training / 30% test.
# seq_len() stays empty for a zero-row data frame, unlike 1:nrow().
trn_idx <- sample(seq_len(nrow(wine)), round(0.7*nrow(wine)))
wine_trn <- wine[trn_idx, ]
wine_tst <- wine[-trn_idx, ]
# Train Multinomial Logistic Regression
ml_logit <- multinom(Class ~ ., data = wine_trn)
# Check the coefficients (summary computed once and reused below)
ml_summary <- summary(ml_logit)
ml_summary
# Conduct 2-tailed z-test to compute the p-values
z_stats <- ml_summary$coefficients/ml_summary$standard.errors
# The upper-tail probability is taken directly rather than as 1 - pnorm(),
# which loses precision for large |z|.
p_value <- 2*pnorm(abs(z_stats), mean = 0, sd = 1, lower.tail = FALSE)
# Prefer fixed over scientific notation when printing the table below
options(scipen = 10)
# Coefficients and p-values side by side, one row per term
cbind(t(ml_summary$coefficients), t(p_value))
변수는 1, 2번 범주와 3번 범주를 구분하는데 유의한 변수
의 값이 높아질수록 1번 범주일 확률은 올라가고 2번 범주일 확률은 낮아짐
# Class-membership probabilities for every test observation
ml_logit_haty <- predict(
  ml_logit,
  newdata = wine_tst,
  type = "probs"
)
# Predicted class labels for the same observations (predict()'s default output)
ml_logit_prey <- predict(
  ml_logit,
  newdata = wine_tst
)
# Confusion matrix: actual class in rows, predicted class in columns
cfmatrix <- table(wine_tst[["Class"]], ml_logit_prey)
# Overall accuracy and balanced correction rate; the author observed that the
# two values come out similar.
perf_eval3(cfmatrix)