if(!require("knitr")) install.packages("knitr")
## Loading required package: knitr
if(!require("MASS")) install.packages("MASS")
## Loading required package: MASS
if(!require("class")) install.packages("class")
## Loading required package: class
if(!require("e1071")) install.packages("e1071")
## Loading required package: e1071
if(!require("pROC")) install.packages("pROC")
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
if(!require("devtools")) install.packages("devtools")
## Loading required package: devtools
## Loading required package: usethis
library("devtools") # for install_github
if(!require("fastAdaboost")) devtools::install_github("souravc83/fastAdaboost")
## Loading required package: fastAdaboost
library("knitr") #
library("MASS") # For logistic regression
library("class") # For knn
library("e1071") # For SVM
library("pROC") # For ROCs
library("fastAdaboost") # For adaboost
This workshop examines classification performance. It consists of a few stages:
* We make a simple classification problem out of the netflow data: can we tell which connections are HTTP?
* We use the Bro log data from Secrepo, which has been used to generate several of the slides.
* We make a data frame consisting of only “good” features, omit missing data, and code the “service” feature as binary according to whether it is http or not; this is the response variable to be predicted.
NB: This is implemented on GitHub, but we are doing it manually here for clarity and exposition.
## This R script reads data:
## The Bro log data from [Secrepo](http://www.secrepo.com/Datasets%20Description/HTML_Bro_log_1/conn.html):
## assigns headers to it and makes a useful subset for data exploration.
## Get the data from the repo
conndata=read.table("https://github.com/dsbristol/dst/raw/master/data/conn_sample.log",as.is=T)
## Assign column names
colnames(conndata)=c('ts','uid','id.orig_h','id.orig_p',
'id.resp_h','id.resp_p','proto','service','duration',
'orig_bytes','resp_bytes','conn_state','local_orig',
'missed_bytes','history','orig_pkts','orig_ip_bytes',
'resp_pkts','resp_ip_bytes','tunnel_parents')
## Make a version of the data that uses only the top 20 most frequently used ports
topports=names(head(sort(table(conndata[,"id.resp_p"]),decreasing=T),20))
conndata2=conndata[conndata[,"id.resp_p"]%in%topports,]
conndataC=conndata[,c("service","duration","orig_bytes",
"resp_bytes","orig_ip_bytes","resp_ip_bytes")]
## Drop rows with missing values, which Bro codes as "-"
conndataC=conndataC[!apply(conndataC,1,function(x)any(x=="-")),]
## Log-transform the numeric features
for(i in c(2:6)) conndataC[,i]=log(1+as.numeric(conndataC[,i]))
for(i in c(1)) conndataC[,i]=as.factor(conndataC[,i])
## Code the response: 0 if the service is http, 1 otherwise
conndataC[,"http"]=as.numeric(conndataC[,"service"]!="http")
## Drop the original service column
conndataC=conndataC[,-1]
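Before going further, it is worth a quick sanity check that the features came out numeric and that both classes are well represented (an optional check):
## Optional sanity checks on the processed data
str(conndataC)
table(conndataC$http)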
We will make two versions of the data. The first uses the 5 features we’ve just picked out. The second is a cut-down version with only two features for understanding and visualisation.
set.seed(1)
## Random 50/50 split into training and test sets
set1=sample(1:dim(conndataC)[1],floor(dim(conndataC)[1]/2))
train=conndataC[set1,]
test=conndataC[-set1,]
train0=train[,c("duration","resp_ip_bytes","http")]
test0=test[,c("duration","resp_ip_bytes","http")]
Save this for use later (we use it in Python in Block 06):
write.csv(test,file=file.path("..","data","conndataC_test.csv"))
write.csv(train,file=file.path("..","data","conndataC_train.csv"))
Logistic regression is a type of generalised linear model. Look up
?glm
to see how to implement it.
We’re giving it its own function so that we don’t have to use a special interface to access it.
logisticregression=function(formula,data,...){
glm(formula,family=binomial(link='logit'),data=data,...)
}
testlogistic <- logisticregression(http ~.,data=train)
plot(testlogistic)
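The plots above are generic glm diagnostics; to inspect the fit itself, summary reports the coefficients on the log-odds scale, and exponentiating them gives odds ratios (an optional inspection):
## Optional: coefficients (log-odds) and odds ratios
summary(testlogistic)
exp(coef(testlogistic))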
We will want to do the same thing with many different classifiers, so we’ll create a standardized interface to keep the coding down. This will automatically run an ROC curve computation.
We are able to specify:
* a modelclass (that works like logisticregression above),
* a formula to tell it which is the response (y~. is the simplest form for a response called y),
* a train dataset,
* a test dataset,
* a predictfn, which is a function that takes a model and returns a continuous response. We will need this later to force some other classes to return the right structure of the data,
* ...: additional parameters to the model.
learnmodel=function(modelclass,formula,train,test,
predictfn=function(x,newdata)predict(x,newdata,type="response"),
...){
## Start by sorting out the formula target
if(is.character(formula)) formula=as.formula(formula)
y=as.character(formula[2])
## Now run the learning
model=modelclass(formula,data=train,...)
## Predict on training data
trainpred0=predictfn(model,newdata=train)
trainpred=ifelse(trainpred0 > 0.5,1,0)
## Predict on testing data
testpred0=predictfn(model,newdata=test)
testpred=ifelse(testpred0 > 0.5,1,0)
## Organise the data for return
trainres=data.frame(truth=train[,y],pred=trainpred,pred0=trainpred0)
testres=data.frame(truth=test[,y],pred=testpred,pred0=testpred0)
## Compute ROC
testroc=roc(truth~pred0,data=as.data.frame(testres))
list(model=model,
trainres=trainres,
testres=testres,
train=train,
test=test,
roc=testroc)
}
Now we’ll run each of the classifiers in turn.
logisticmodel=learnmodel(logisticregression,http~.,train,test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(logisticmodel$roc)
We can see that we do well for the first 50% of true positives, but then lose power.
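We can attach numbers to this claim with pROC: the overall AUC, and the operating point at 50% sensitivity (an optional check; coords reports the specificity and threshold there):
## Optional: overall AUC, and specificity/threshold at 50% sensitivity
auc(logisticmodel$roc)
coords(logisticmodel$roc, x=0.5, input="sensitivity", ret=c("specificity","threshold"))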
K-NN is awkward to standardize because there is no separate model-learning step, so its interface differs; we've standardized it below.
Read the code and add the correct call to learnmodel(knnclass,...) below.
knnclass=function(formula,data,k){
## knn does not provide the usual interface, so we define it from scratch here
## We need to know what the x and y parts of y~x are, and to store all the data and k
if(is.character(formula)) formula=as.formula(formula)
y=as.character(formula[2])
x=labels(terms(formula, data=data))
ret=list(formula=formula,train=data,k=k,x=x,y=y)
class(ret)="knnclass"
ret
}
predict.knnclass=function(x,newdata,...){
## knn can now be run on the new data. It returns a factor with a "prob" attribute
## holding the vote proportion for the predicted class, so we transform this into
## the probability of class 1.
predtmp=knn(x$train[,x$x], newdata[,x$x], x$train[,x$y], k = x$k, prob=TRUE)
pred0=attr(predtmp,"prob")
pred=as.numeric(predtmp)-1
pred0[pred==0] = 1-pred0[pred==0]
pred0
}
## use the new functions
knnmodel=learnmodel(knnclass,http~.,train,test,k=5,predictfn=predict.knnclass)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(knnmodel$roc)
What do you think about performance?
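One way to explore this is to vary k and compare AUCs; a quick sketch (km is a throwaway name):
## Sketch: sensitivity of K-NN performance to the choice of k
for(k in c(1,5,25)){
km=learnmodel(knnclass,http~.,train,test,k=k,predictfn=predict.knnclass)
cat("k =",k,"AUC =",as.numeric(auc(km$roc)),"\n")
}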
### LDA
Again, LDA is close to the standard interface, but requires a tweak to the prediction function.
predictlda=function(x,newdata,...){
## The lda predict method hides the scores in $posterior; column 1 is the
## posterior probability of the first class level (0), which is why pROC
## reports the direction "controls > cases" below.
predict(x,newdata,type="response")$posterior[,1]
}
ldamodel=learnmodel(lda,http~.,train,test,predictfn=predictlda)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
plot(ldamodel$roc)
SVM has a relatively straightforward interface, with only a “kernel” to be specified.
svmmodel=learnmodel(svm,http~.,train,test,kernel="linear")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(svmmodel$roc)
svmrmodel=learnmodel(svm,http~.,train,test,kernel="radial")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(svmrmodel$roc)
Printing the models lets us see the details of the fit, in particular the number of support vectors. Note the SVM-Type: because our response is numeric, svm defaults to eps-regression rather than classification, which conveniently returns the continuous scores our ROC computation needs.
print(svmmodel$model)
##
## Call:
## svm(formula = formula, data = train, kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.2
## epsilon: 0.1
##
##
## Number of Support Vectors: 1642
print(svmrmodel$model)
##
## Call:
## svm(formula = formula, data = train, kernel = "radial")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.2
## epsilon: 0.1
##
##
## Number of Support Vectors: 923
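The cost, gamma and epsilon reported above are e1071's defaults. If you wanted to tune them, e1071 provides cross-validated tuning via tune.svm; a sketch with an illustrative grid (svmtuned is a throwaway name, and this can be slow):
## Sketch: cross-validated tuning of SVM hyperparameters (illustrative grid)
svmtuned=tune.svm(http~.,data=train,kernel="radial",cost=10^(-1:2),gamma=2^(-2:1))
summary(svmtuned)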
How does SVM compare at model-learning and prediction time to K-NN?
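A rough way to check (timings are hardware-dependent; svmtimed is a throwaway name):
## Sketch: SVM pays its cost at training time...
system.time(svmtimed <- svm(http~.,data=train,kernel="linear"))
system.time(predict(svmtimed,newdata=test))
## ...whereas K-NN pays at prediction time (there is no training step)
system.time(knn(train[,1:5],test[,1:5],train[,"http"],k=5))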
Think about interpreting these models.
Interpret the ROC curve. Which methods are working well? Why?
We plot the ROC curves together for comparison.
par(mfrow=c(1,2))
for(xmax in c(1,0.05)){
plot(1-logisticmodel$roc$specificities,logisticmodel$roc$sensitivities,type="l",
xlab="False Positive Rate",ylab="True Positive Rate",xlim=c(0,xmax),col=2)
lines(1-ldamodel$roc$specificities,ldamodel$roc$sensitivities,col=3)
lines(1-svmmodel$roc$specificities,svmmodel$roc$sensitivities,col=4)
lines(1-svmrmodel$roc$specificities,svmrmodel$roc$sensitivities,col=4,lty=2)
lines(1-knnmodel$roc$specificities,knnmodel$roc$sensitivities,col=5,lty=2)
abline(a=0,b=1)
if(xmax==1){
legend("bottomright",c("Logistic","LDA","SVM (linear)","SVM (radial)","K-NN"),
text.col=c(2,3,4,4,5),col=c(2,3,4,4,5),lty=c(1,1,1,2,1),bty="n")
}
}
The linear methods are all fundamentally limited because there is no linear separation in the data. However, they do very well at low FPR.
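To put numbers on this comparison, we can tabulate the AUCs (optional):
## Optional: AUC for each model
sapply(list(logistic=logisticmodel,lda=ldamodel,svmlinear=svmmodel,
svmradial=svmrmodel,knn=knnmodel),function(m)as.numeric(auc(m$roc)))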
Think about interpreting model performance.
We’re going to make a 2D plot of the input space and color it by class prediction.
Running the classification is now easy since we’ve made the interface.
################
## Simple plots for illustration
## Using 2 predictors only
logistic2dmodel <- learnmodel(logisticregression,http ~.,train=train0,test=test0)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
knn2dmodel <- learnmodel(knnclass,http ~.,train=train0,test=test0,k=5,predictfn=predict.knnclass)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
lda2dmodel <- learnmodel(lda,http ~.,train=train0,test=test0,predictfn=predictlda)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
svm2dmodel <- learnmodel(svm,http ~.,train=train0,test=test0,kernel="linear")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
svmr2dmodel <- learnmodel(svm,http ~.,train=train0,test=test0,kernel="radial")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Here we make a grid to see what each classifier is doing around the space.
make.grid = function(x, n = 75) {
## Make a grid with n X n values over the range of x[,1] and x[,2]
grange = apply(x, 2, range)
x1 = seq(from = grange[1,1], to = grange[2,1], length = n)
x2 = seq(from = grange[1,2], to = grange[2,2], length = n)
ret=expand.grid(X1 = x1, X2 = x2)
colnames(ret)=colnames(x)
ret
}
traingrid = make.grid(train0[,1:2])
## Classify each grid point. The SVM fits are regressions on the 0/1 response and
## predict.knnclass returns probabilities, so we cut both at 0.5; predict.glm
## defaults to the link scale, where 0 corresponds to a probability of 0.5.
svm2dgrid = predict(svm2dmodel$model, traingrid)>0.5
svmr2dgrid = predict(svmr2dmodel$model, traingrid)>0.5
knn2dgrid=predict(knn2dmodel$model,traingrid)>0.5
logistic2dgrid=predict(logistic2dmodel$model,traingrid)>0
par(mfrow=c(2,2))
plot(traingrid, col = c("orange","cyan")[1+as.numeric(logistic2dgrid)], pch = 20, cex = .5,main="Logistic")
points(train0[,1:2], col = c("red","blue")[train0[,3] + 1], pch = 19,cex=1)
plot(traingrid, col = c("orange","cyan")[1+as.numeric(knn2dgrid)], pch = 20, cex = .5,main="K-NN (K=5)")
points(train0[,1:2], col = c("red","blue")[train0[,3] + 1], pch = 19,cex=1)
plot(traingrid, col = c("orange","cyan")[1+as.numeric(svm2dgrid)], pch = 20, cex = .5,main="Linear SVM")
points(train0[,1:2], col = c("red","blue")[train0[,3] + 1], pch = 19,cex=1)
points(train0[svm2dmodel$model$index,1:2], pch = 5, cex = 1)
plot(traingrid, col = c("orange","cyan")[1+as.numeric(svmr2dgrid)], pch = 20, cex = .5,main="Radial SVM")
points(train0[,1:2], col = c("red","blue")[train0[,3] + 1], pch = 19,cex=1)
points(train0[svmr2dmodel$model$index,1:2], pch = 5, cex = 1)
Think about interpreting the input space.
Here we try stacking.
To do that, we make a dataset containing the training data, stacked with the outputs from the 3 linear classifiers, and pass it into learnmodel using logisticregression.
(We'll interpret this after running boosting.) Expect warnings from glm below: fitted probabilities of exactly 0 or 1 suggest the stacked predictors separate the classes almost perfectly, and the rank-deficient fit suggests they are nearly collinear with one another.
lintrain=cbind(train,
data.frame(logistic=logisticmodel$trainres$pred0,
lda=ldamodel$trainres$pred0,
svm=svmmodel$trainres$pred0))
lintest=cbind(test,
data.frame(logistic=logisticmodel$testres$pred0,
lda=ldamodel$testres$pred0,
svm=svmmodel$testres$pred0))
linstackmodel=learnmodel(logisticregression,http~.,lintrain,lintest)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(linstackmodel$roc)
Boosting uses a default weak learner (decision trees, in fastAdaboost), and this implementation does not allow custom weak learners.
predictada=function(x,newdata){
predict(x,newdata=newdata,type='response')$prob[,2]
}
adamodel=learnmodel(adaboost,http~.,train,test,predictfn=predictada,nIter=10)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
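As with the other classifiers, the individual ROC curve can be plotted:
plot(adamodel$roc)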
The code below adds the boosting and stacking ROC curves to the earlier comparison. How do they perform? Why do you think the performance is ordered as it is?
par(mfrow=c(1,2))
for(xmax in c(1,0.05)){
plot(1-logisticmodel$roc$specificities,logisticmodel$roc$sensitivities,type="l",
xlab="False Positive Rate",ylab="True Positive Rate",xlim=c(0,xmax),col=2)
lines(1-ldamodel$roc$specificities,ldamodel$roc$sensitivities,col=3)
lines(1-svmmodel$roc$specificities,svmmodel$roc$sensitivities,col=4)
lines(1-svmrmodel$roc$specificities,svmrmodel$roc$sensitivities,col=4,lty=2)
lines(1-knnmodel$roc$specificities,knnmodel$roc$sensitivities,col=5,lty=2)
lines(1-linstackmodel$roc$specificities,linstackmodel$roc$sensitivities,col=6,lty=2)
lines(1-adamodel$roc$specificities,adamodel$roc$sensitivities,col=7,lty=2)
abline(a=0,b=1)
if(xmax==1){
legend("bottomright",c("Logistic","LDA","SVM (linear)","SVM (radial)","K-NN","Stacked linear methods","Adaboost"),
text.col=c(2,3,4,4,5,6,7),col=c(2,3,4,4,5,6,7),lty=c(1,1,1,2,1,2,2),bty="n")
}
}
Think about interpreting the meta-learning approaches.