Statistical Learning in R

Statistical learning is a branch of machine learning in which computers find patterns in data without being explicitly programmed, and it has a wide array of applications, from speech recognition to sports analytics. In particular, supervised statistical learning is useful for regression and classification problems, in which we build a model from training data and then evaluate it on new data. In this short course, we will explore the theory behind classification and look at the packages R provides for statistical learning. While a wide variety of methods exist, we will focus on the k-nearest neighbors (KNN) algorithm and support vector machines (SVMs).

Presenters: Katie Zagnoli and Ian Laga

Location: Ketchum IB87
Time: Monday, April 10, 5:00 - 7:00 PM

The two R files used in this course, iris.classification.R and SVM.R, are listed below:

iris.classification.R

# Look at data (part of R) ##############################################################
data(iris)
head(iris)
summary(iris)

# Investigate a few 2D plots ############################################################
plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, pch=19,
     xlab='sepal length', ylab='sepal width', main='Iris species')
    par(xpd=TRUE)    # allow the legend to be drawn outside the plot region
    legend(7, 5, unique(iris$Species), col=1:length(unique(iris$Species)), pch=19)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species, pch=19,
     xlab='petal length', ylab='petal width', main='Iris species')
    par(xpd=TRUE)
    legend(5.5, 1, unique(iris$Species), col=1:length(unique(iris$Species)), pch=19)

# KNN ###################################################################################
# install.packages("class")
library(class)

# Normalize the data? Consider the range of each predictor:
    summary(iris)
    # scale() standardizes each predictor to mean 0 and standard deviation 1,
    # so that no single feature dominates the distance calculation.
    ir.nm <- as.data.frame(lapply(iris[1:4], scale))
    summary(ir.nm)
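
    # Aside: min-max normalization to [0,1] is a common alternative to the
    # z-score scaling above; a minimal sketch (minmax and ir.mm are helpers
    # defined here and not used in the rest of this script):
    minmax <- function(x) (x - min(x)) / (max(x) - min(x))
    ir.mm <- as.data.frame(lapply(iris[1:4], minmax))
    summary(ir.mm)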
# Divide data into Train and Test
    set.seed(8)
    test.ind <- sample(1:nrow(ir.nm), 50)
    ir.test <- ir.nm[test.ind,]           # test/train sets contain only the predictor variables
    ir.train <- ir.nm[-test.ind,]
    test.lab <- iris$Species[test.ind]    # store the correct species labels for the test and training sets
    train.lab <- iris$Species[-test.ind]
    summary(test.lab)

# KNN
?knn
iris.pred <- knn(train=ir.train, test=ir.test, cl=train.lab, k=3)
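
# Aside: prob=TRUE asks knn() to also return the winning class's share of
# the neighbor votes (a sketch, not needed for the rest of the script):
iris.pred.p <- knn(train=ir.train, test=ir.test, cl=train.lab, k=3, prob=TRUE)
head(attr(iris.pred.p, "prob"))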

# Contingency Table / Confusion Matrix ##################################################
table(test.lab, iris.pred)
    # 2 misclassifications for set.seed(8), k=3

# Calculate Accuracy
mean(iris.pred == test.lab)

# Test accuracy for different k values
acc <- NULL
for (i in 1:15) {
    iris.pred <- knn(train=ir.train, test=ir.test, cl=train.lab, k=i)
    acc[i] <- mean(iris.pred == test.lab)*100
}
plot(acc,xlab='K',ylab='accuracy [%]',main='Accuracy for KNN on Iris Species', pch=19,col=2)
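
# Aside: the curve above reflects a single train/test split. A minimal sketch
# of leave-one-out cross-validation with class::knn.cv, which scores each k
# using only the training data:
cv.acc <- sapply(1:15, function(i)
    mean(knn.cv(train=ir.train, cl=train.lab, k=i) == train.lab)*100)
plot(cv.acc, xlab='K', ylab='LOOCV accuracy [%]', pch=19, col=4)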
#########################################################################################

# How does K affect the decision boundary? ##############################################
# Based on code by Professor Will Kleiber

# Synthetic data:
set.seed(4)
V0.0 <- rnorm(40,mean=1,sd=0.5)
V0.1 <- rnorm(60,mean=0,sd=0.5)
V1.0 <- rnorm(40,mean=-1,sd=1)
V1.1 <- rnorm(60,mean=0,sd=1)
dat <- data.frame(class=rep(c(0,1),times=c(40,60)), V0=c(V0.0,V0.1), V1=c(V1.0,V1.1))
plot(V1~V0,data=dat,asp=1)    # aspect ratio 1 so that distances appear the same in both directions
points(V1~V0,data=dat,subset=dat$class==1,col="red",pch=19)
points(V1~V0,data=dat,subset=dat$class==0,col="blue",pch=19)
par(xpd=TRUE)
legend(2,3.3,c('class 0', 'class 1'), pch=c(19,19),col=c('blue','red'))

# Locations at which to make predictions:
predict.grid <- as.matrix(expand.grid(seq(-2,3,length.out=60),seq(-3.5,2,length.out=60)))
dim(predict.grid)
points(predict.grid,col="green",pch=19,cex=0.1)

# Plot decision boundary for different values of k: (Hit Enter to see plots)
par(mfrow=c(1,1),ask=TRUE)
vec <- c(1,2,3,4,5,6,7,10,18,27,38,50,100)
for(k in vec){
    plot(V1~V0,data=dat,asp=1,main=paste("# of nearest neighbors =",k))
    points(V1~V0,data=dat,subset=dat$class==1,col="red",pch=19)
    points(V1~V0,data=dat,subset=dat$class==0,col="blue",pch=19)

    # predict the class of every grid point; cl=dat[,1] supplies the class labels
    out <- knn(train=dat[,2:3],test=predict.grid,cl=dat[,1],k=k)
    points(predict.grid[out==0,],col="springgreen",pch=19,cex=0.6)
    points(predict.grid[out==1,],col="yellow",pch=19,cex=0.6)
}
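# Housekeeping: reset the interactive prompting enabled above.
par(ask=FALSE)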

SVM.R

install.packages("e1071")
library(e1071)
install.packages("ROCR")
library(ROCR)

set.seed(52)
dat <- iris[51:150,c(1,2,5)]        # keep only versicolor and virginica, with two predictors
dat$Species <- factor(dat$Species)  # drop the unused setosa level

plot(dat$Sepal.Length~dat$Sepal.Width, col="white")    # empty plot to set up the axes
points(dat[dat$Species=="versicolor",2], dat[dat$Species=="versicolor",1], col="blue")
points(dat[dat$Species!="versicolor",2], dat[dat$Species!="versicolor",1], col="red", pch=19)

train <- sample(1:dim(dat)[1], size=50, replace=FALSE)

dat.train <- dat[train,]
dat.test <- dat[-train,]

##
## SVM fit
##

## support vector classifier
fit.svc <- svm(Species~.,data=dat.train,kernel="linear",cost=1)
plot(fit.svc,data=dat.train)
# tuning
tune.out <- tune(svm,Species~.,data=dat.train,kernel="linear",
                 ranges=list(cost=seq(0.01,5,length.out=100)),probability=TRUE)
fit.svc <- tune.out$best.model

fit.svc
plot(fit.svc,data=dat.train)
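
# Aside: tune() performs 10-fold cross-validation over the cost grid by
# default; a sketch of how to inspect the search (the same pattern applies
# to the tuned fits below):
summary(tune.out)           # CV error for each candidate cost
tune.out$best.parameters    # the selected cost value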

## support vector machine: polynomial
fit.svmp <- svm(Species~.,data=dat.train,kernel="polynomial",cost=10,degree=3)
plot(fit.svmp,data=dat.train)
# tuning
tune.out <- tune(svm,Species~.,data=dat.train,kernel="polynomial",degree=3,
                 ranges=list(cost=seq(0.01,5,length.out=100)),probability=TRUE)
fit.svmp <- tune.out$best.model

fit.svmp
plot(fit.svmp,data=dat.train)

## svm: radial
fit.svmr <- svm(Species~.,data=dat.train,kernel="radial",cost=1)
plot(fit.svmr,data=dat.train)
# tuning
tune.out <- tune(svm,Species~.,data=dat.train,kernel="radial",
                 ranges=list(cost=seq(0.01,5,length.out=100)),probability=TRUE)
fit.svmr <- tune.out$best.model

fit.svmr
plot(fit.svmr,data=dat.train)

##
## Testing set performance
##

# linear support vector classifier
tab <- table(true=dat.test[,"Species"],pred=predict(fit.svc,newdata=dat.test))
tab
(tab[1,1]+tab[2,2])/dim(dat.test)[1]

# polynomial-kernel SVM
tab <- table(true=dat.test[,"Species"],pred=predict(fit.svmp,newdata=dat.test))
tab
(tab[1,1]+tab[2,2])/dim(dat.test)[1]

# radial-kernel SVM
tab <- table(true=dat.test[,"Species"],pred=predict(fit.svmr,newdata=dat.test))
tab
(tab[1,1]+tab[2,2])/dim(dat.test)[1]
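
# Aside: sum(diag(tab))/sum(tab) computes the same accuracy and also
# generalizes to tables with more than two classes:
sum(diag(tab))/sum(tab)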

##############
## ROC Plots
##############

# Note: column 2 of attributes(...)$probabilities must correspond to the
# class that ROCR treats as positive (by default the second factor level);
# check colnames(attributes(probs.svc)$probabilities) if a curve looks inverted.
probs.svc <- predict(fit.svc,dat.test,probability=TRUE)
pred.svc <- prediction(attributes(probs.svc)$probabilities[,2], dat.test$Species)
perf.svc <- performance(pred.svc,"tpr","fpr")

probs.svmp <- predict(fit.svmp,dat.test,probability=TRUE)
pred.svmp <- prediction(attributes(probs.svmp)$probabilities[,2], dat.test$Species)
perf.svmp <- performance(pred.svmp,"tpr","fpr")

probs.svmr <- predict(fit.svmr,dat.test,probability=TRUE)
pred.svmr <- prediction(attributes(probs.svmr)$probabilities[,2], dat.test$Species)
perf.svmr <- performance(pred.svmr,"tpr","fpr")

plot(perf.svc,col="red")
plot(perf.svmp,col="green", add=TRUE)
plot(perf.svmr,col="purple", add=TRUE)
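
# Aside: a reference line and legend make the ROC plot easier to read, and
# ROCR can also report the area under each curve (a sketch):
abline(0, 1, lty=2)    # diagonal = expected performance of random guessing
legend("bottomright", c("linear","polynomial","radial"),
       col=c("red","green","purple"), lty=1)
performance(pred.svc, "auc")@y.values[[1]]    # AUC for the linear classifier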

######################################################
## Radial Example
######################################################

rm(list=ls())

set.seed(54)
x <- matrix(rnorm(500*2),ncol=2)
y <- rep(0, 500)
for(i in 1:500){
  # label points inside a noisy circle of radius about 1 as class 1
  if(sqrt(x[i,1]^2+x[i,2]^2) < 1+rnorm(1, sd=0.25)){
    y[i] = 1
  }
}

plot(x[,1],x[,2], col="white")    # empty plot to set up the axes
points(x[y==0,1],x[y==0,2], col="black")
points(x[y==1,1],x[y==1,2], col="red", pch=19)

dat <- data.frame(X=x,y=as.factor(y))

train <- sample(1:dim(dat)[1], size=100, replace=FALSE)

dat.train <- dat[train,]
dat.test <- dat[-train,]

##
## SVM fit
##

## support vector classifier
fit.svc <- svm(y~.,data=dat.train,kernel="linear",cost=1)
plot(fit.svc,data=dat.train)
# tuning
tune.out <- tune(svm,y~.,data=dat.train,kernel="linear",
                 ranges=list(cost=seq(0.01,5,length.out=100)))
fit.svc <- tune.out$best.model

fit.svc
plot(fit.svc,data=dat.train)

## support vector machine: polynomial
fit.svmp <- svm(y~.,data=dat.train,kernel="polynomial",cost=10,degree=3)
plot(fit.svmp,data=dat.train)
# tuning
tune.out <- tune(svm,y~.,data=dat.train,kernel="polynomial",degree=3,
                 ranges=list(cost=seq(0.01,5,length.out=100)))
fit.svmp <- tune.out$best.model

fit.svmp
plot(fit.svmp,data=dat.train)

## svm: radial
fit.svmr <- svm(y~.,data=dat.train,kernel="radial",cost=1)
plot(fit.svmr,data=dat.train)
# tuning
tune.out <- tune(svm,y~.,data=dat.train,kernel="radial",
                 ranges=list(cost=seq(0.01,5,length.out=100)))
fit.svmr <- tune.out$best.model

fit.svmr
plot(fit.svmr,data=dat.train)

##
## Testing set performance
##

# linear support vector classifier
tab <- table(true=dat.test[,"y"],pred=predict(fit.svc,newdata=dat.test))
tab
(tab[1,1]+tab[2,2])/dim(dat.test)[1]

# polynomial-kernel SVM
tab <- table(true=dat.test[,"y"],pred=predict(fit.svmp,newdata=dat.test))
tab
(tab[1,1]+tab[2,2])/dim(dat.test)[1]

# radial-kernel SVM
tab <- table(true=dat.test[,"y"],pred=predict(fit.svmr,newdata=dat.test))
tab
(tab[1,1]+tab[2,2])/dim(dat.test)[1]
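
# Aside: the same comparison in a single pass, using the three fits above:
fits <- list(linear=fit.svc, polynomial=fit.svmp, radial=fit.svmr)
sapply(fits, function(f) mean(predict(f, newdata=dat.test) == dat.test$y))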