# FROM: Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
# rpart is needed for the classification trees throughout
if (!require("rpart")) install.packages("rpart")
library(rpart)
spamData <- read.csv("spamData.csv")
names(spamData) <- c("make","address","all","3d","our","over","remove","internet","order","mail","receive","will","people","report","addresses","free","business","email","you","credit","your","font","000","money","hp","hpl","george","650","lab","labs","telnet","857","data","415","85","technology","1999","parts","pm","direct","cs","meeting","original","project","re","edu","table","conference",";:","(:","[:","!:","$:","#:","avg_cap_length","longest_cap_length","total_cap_length","spam_label")
n <- nrow(spamData)
# Seed the RNG before shuffling so the folds are reproducible
set.seed(1)
shuffled <- spamData[sample(n), ]
# Initialize the accs vector
accs <- rep(0,6)
for (i in 1:6) {
# These indices delimit the i-th test fold; min() keeps the last fold
# from running past row n when n/6 does not divide evenly
fold_size <- round(n / 6)
indices <- ((i - 1) * fold_size + 1):min(i * fold_size, n)
# Exclude them from the train set
train <- shuffled[-indices,]
# Include them in the test set
test <- shuffled[indices,]
# A model is learned using each training set
tree <- rpart(spam_label ~ ., train, method = "class")
# Make a prediction on the test set using tree
pred <- predict(tree, test, type = "class")
# Assign the confusion matrix to conf
conf <- table(test$spam_label, pred)
# Assign the accuracy of this model to the ith index in accs
accs[i] <- sum(diag(conf))/sum(conf)
}
print(accs)
## [1] 0.9022164 0.9022164 0.9152542 0.8748370 0.8826597 0.8877285
# Print out the mean of accs
print(mean(accs))
## [1] 0.894152
Bias and variance are the two main challenges of machine learning. Bias stems from wrong assumptions built into the model; variance stems from the model's sensitivity to the particular sample it was trained on. Irreducible error is noise inherent in the data and cannot be minimized; reducible error, the part we can actually work on, is made up of bias and variance.
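A minimal simulation makes the split concrete (hypothetical data; the sd = 0.3 noise plays the role of the irreducible error): fit a flexible model to many resampled training sets, then measure how far the average fit is from the truth (bias) and how much individual fits scatter around that average (variance).
set.seed(42)
x <- seq(0, 1, length.out = 50)
f <- function(x) sin(2 * pi * x)                  # the true signal
fits <- replicate(100, {
  y <- f(x) + rnorm(50, sd = 0.3)                 # fresh noisy sample
  predict(lm(y ~ poly(x, 9)), data.frame(x = x))  # flexible fit
})
mean((rowMeans(fits) - f(x))^2)  # squared bias of the average fit
mean(apply(fits, 1, var))        # variance across training samples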
# Example of a hard-coded classifier that assigns factor levels to a predictor
spam_classifier <- function(x) {
  prediction <- rep(NA, length(x))
  prediction[x > 4] <- 1
  prediction[x <= 4] <- 0
  factor(prediction, levels = c("1", "0"))
}
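Applied to a couple of made-up feature values:
spam_classifier(c(2, 6))
## [1] 0 1
## Levels: 1 0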
if (!require("rpart.plot")) install.packages("rpart.plot")
library(rpart.plot)
if (!require("RColorBrewer")) install.packages("RColorBrewer")
library(RColorBrewer)
if (!require("rattle")) install.packages("rattle")
library(rattle)
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices,]
test_indices <- (round(0.7*n)+1):n
test <- shuffled[test_indices, ]
tree <- rpart(spam_label ~ ., train, method = "class", parms = list(split = "information"))
pred <- predict(tree, test, type = "class")
conf <- table(test$spam_label, pred)
acc <- sum(diag(conf)) / sum(conf)
print(acc)
## [1] 0.8818841
print(conf)
##    pred
##       0   1
##   0 769  66
##   1  97 448
fancyRpartPlot(tree)
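Accuracy is not the only number the confusion matrix yields; precision and recall for the spam class fall straight out of the same table (rows are truth, columns are predictions, as above):
TP <- conf["1", "1"]; FP <- conf["0", "1"]; FN <- conf["1", "0"]
TP / (TP + FP)  # precision: 448 / (448 + 66)
TP / (TP + FN)  # recall:    448 / (448 + 97)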
# Normalize data to a 0-1 scale (min-max), e.g. for a hypothetical Age column:
# knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
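A small helper keeps that rescaling in one place (min_max is a name introduced here, not from any package):
min_max <- function(x) (x - min(x)) / (max(x) - min(x))
# e.g. knn_train$Age <- min_max(knn_train$Age)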
# Read categorical columns as factors (R >= 4.0 defaults stringsAsFactors to FALSE)
df <- read.table("./adultCensus.data", header = FALSE, sep = ",", stringsAsFactors = TRUE)
df_test <- read.table("./adultTest.test", header = FALSE, sep = ",", skip = 1, stringsAsFactors = TRUE)
names(df) <- c("age","workclass","fnlwgt","education", "education-num","maritalstatus","occupation","relationship","race","sex","capital-gain","capital-loss","hoursPerWeek","native-country","income")
names(df_test) <- names(df)
# Peek at the first row to sanity-check the column names
df[1, ]
# Drop fnlwgt, a census sampling weight rather than a predictive feature
df$fnlwgt <- NULL
tree <- rpart(income ~ ., df, method = "class", parms = list(split = "gini"))
# Column 2 holds the predicted probability of the positive class (">50K")
all_probs <- predict(tree, df_test, type = "prob")[, 2]
head(all_probs)
##          1          2          3          4          5          6
## 0.04987988 0.30019374 0.30019374 0.98084291 0.04987988 0.04987988
fancyRpartPlot(tree)
if (!require("ROCR")) install.packages("ROCR")
library(ROCR)
pred <- prediction(all_probs, df_test$income)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
perf <- performance(pred, "auc")
print(perf@y.values[[1]])
## [1] 0.8451151
if (!require("class")) install.packages("class")
library(class)
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices,]
test_indices <- (round(0.7*n)+1):n
test <- shuffled[test_indices, ]
knn_train <- train
knn_test <- test
knn_train_labels <- knn_train$spam_label
knn_train$spam_label <- NULL
knn_test_labels <- knn_test$spam_label
knn_test$spam_label <- NULL
# Min-max normalize every column to [0, 1]
# (strictly, the test set should reuse the training set's min and max)
knn_train <- apply(knn_train, 2, function(x) (x - min(x)) / (max(x) - min(x)))
knn_test <- apply(knn_test, 2, function(x) (x - min(x)) / (max(x) - min(x)))
pred <- knn(train = knn_train, test = knn_test, cl = knn_train_labels, k = 5)
conf <- table(knn_test_labels, pred)
print(conf)
##                pred
## knn_test_labels   0   1
##               0 757  78
##               1  94 451
# Try k from 1 to 50 (k_range avoids masking base::range)
k_range <- 1:50
accs <- rep(0, length(k_range))
for (k in k_range) {
pred <- knn(knn_train, knn_test, knn_train_labels, k=k)
conf <- table(knn_test_labels, pred)
accs[k] <- sum(diag(conf)) / sum(conf)
# Report progress every ten values of k
if (k %% 10 == 0) {
print(k)
}
}
## [1] "10"
## [1] "10"
## [1] "10"
## [1] "10"
## [1] "10"
plot(k_range, accs, xlab = "k", ylab = "accuracy")
which.max(accs)
## [1] 10
train_indices <- 1:round(0.7*n)
train <- shuffled[train_indices,]
test_indices <- (round(0.7*n)+1):n
test <- shuffled[test_indices, ]
set.seed(1)
knn_train <- train
knn_test <- test
knn_train_labels <- knn_train$spam_label
knn_train$spam_label <- NULL
knn_test_labels <- knn_test$spam_label
knn_test$spam_label <- NULL
# Standardize features; scale() is the per-column z-score (x - mean(x)) / sd(x).
# Strictly, the test set should reuse the training set's center and scale.
knn_train <- scale(knn_train)
knn_test <- scale(knn_test)
pred <- knn(train = knn_train, test = knn_test, cl = knn_train_labels, k = 5)
conf <- table(knn_test_labels, pred)
print(conf)
##                pred
## knn_test_labels   0   1
##               0 784  51
##               1  90 455
k_range <- 1:50
accs <- rep(0, length(k_range))
for (k in k_range) {
pred <- knn(knn_train, knn_test, knn_train_labels, k=k)
conf <- table(knn_test_labels, pred)
accs[k] <- sum(diag(conf)) / sum(conf)
# Report progress every ten values of k
if (k %% 10 == 0) {
print(k)
}
}
## [1] "10"
## [1] "10"
## [1] "10"
## [1] "10"
## [1] "10"
plot(k_range, accs, xlab = "k", ylab = "accuracy")
best <- which.max(accs)
pred <- knn(train = knn_train, test = knn_test, cl = knn_train_labels, k = best, prob = TRUE)
conf <- table(knn_test_labels, pred)
acc <- sum(diag(conf)) / sum(conf)
print(conf)
##                pred
## knn_test_labels   0   1
##               0 780  55
##               1  91 454
print(paste(acc, "knn accuracy"))
## [1] "0.894202898550725 knn accuracy"
# attr(pred, "prob") is the vote share of the *winning* class,
# not the probability of class 1 (see the note after the AUCs below)
probs_k <- attr(pred, "prob")
tree <- rpart(spam_label ~ ., train, method = "class")
# Class predictions first; class probabilities come later for the ROC curve
pred_class <- predict(tree, test, type = "class")
conf <- table(test$spam_label, pred_class)
acc <- sum(diag(conf)) / sum(conf)
print(conf)
##    pred_class
##       0   1
##   0 778  57
##   1 105 440
print(paste(acc, "tree accuracy"))
## [1] "0.882608695652174 tree accuracy"
probs_t <- predict(tree, test, type = "prob")[, 2]
pred_t <- prediction(probs_t, test$spam_label)
pred_k <- prediction(probs_k, test$spam_label)
perf_t <- performance(pred_t, "auc")
perf_k <- performance(pred_k, "auc")
# ROC_curves() is not defined here; plot both curves with ROCR instead
plot(performance(pred_t, "tpr", "fpr"), col = "blue")
plot(performance(pred_k, "tpr", "fpr"), col = "red", add = TRUE)
print(perf_t@y.values[[1]])
## [1] 0.886733
print(perf_k@y.values[[1]])
## [1] 0.4365995
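The kNN AUC lands below 0.5 because, as noted above, class::knn reports the vote share of the winning class rather than P(spam = 1). A minimal fix, sketched here without re-running the numbers, flips the share whenever the predicted class is 0:
# Recover P(spam = 1) from winning-class vote shares
probs_spam <- ifelse(pred == "1", probs_k, 1 - probs_k)
pred_k <- prediction(probs_spam, test$spam_label)
performance(pred_k, "auc")@y.values[[1]]  # should now land well above 0.5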
if (!require("ggvoronoi")) install.packages("ggvoronoi")
library(ggvoronoi)
x <- sample(1:150,50)
y <- sample(1:150,50)
points <- data.frame(x, y, distance = sqrt((x-75)^2 + (y-75)^2))
ggplot(points) +
geom_voronoi(aes(x,y,fill=distance)) +
scale_fill_gradient(low="#4dffb8",high="navyblue",guide=F) +
geom_point(aes(x,y)) +
theme_void() +
coord_fixed()
Copyright © 2019 Tomas Leriche. All rights reserved.