Workshop 6.2.1 Overview

This is part 1 of the Block 6 Workshop (6.2); part 2 is in Python.

This workshop adds trees and forests to our Block 5 classification experiments.

We implement a Random Forest in R and save some data for comparison with Python, using the same data representation as in Block 5.

0. Requirements

if(!require("gbm")) install.packages("gbm")
## Loading required package: gbm
## Loaded gbm 2.1.8.1
if(!require("knitr")) install.packages("knitr")
## Loading required package: knitr
if(!require("MASS")) install.packages("MASS")
## Loading required package: MASS
if(!require("class")) install.packages("class")
## Loading required package: class
if(!require("e1071")) install.packages("e1071")
## Loading required package: e1071
if(!require("pROC")) install.packages("pROC")
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
if(!require("devtools")) install.packages("devtools")
## Loading required package: devtools
## Loading required package: usethis
if(!require("tree")) install.packages("tree")
## Loading required package: tree
if(!require("randomForest")) install.packages("randomForest")
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library("devtools") # for install_github
if(!require("fastAdaboost")) devtools::install_github("souravc83/fastAdaboost")
## Loading required package: fastAdaboost
library("knitr") # 
library("MASS") # For logistic regression
library("class") # For knn
library("e1071") # For SVM
library("pROC") # For ROCs
library("fastAdaboost") # For adaboost
library("gbm")
library("tree") # For decision trees
library("randomForest") # For... random forests

The regular boilerplate.

1. Data and previous models

1.1 Data input

We’re making a simple classification problem out of the netflow data: can we tell which connections are HTTP?

Load the saved data for test/train split:

test=read.csv(file=file.path("..","data","conndataC_test.csv"),row.names=1)
train=read.csv(file=file.path("..","data","conndataC_train.csv"),row.names=1)
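
It is worth a quick look at the data before modelling; a minimal check of our own, assuming the CSVs above loaded correctly:

str(train)        ## feature types and dimensions
table(train$http) ## class balance of the target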

1.2 Block05 code

Here we have duplicated the Block05 models for convenience.

1.2.1 Learnmodel

require("pROC")
learnmodel=function(modelclass,formula,train,test, 
                    predictfn=function(x,newdata)predict(x,newdata,type="response"),
                    ...){ 
  ## Start by sorting out the formula target
  if(is.character(formula)) formula=formula(formula)
  y=as.character(formula[2])
  
  ## Now run the learning 
  model=modelclass(formula,data=train,...)
  
  ## Predict on training data
  trainpred0=predictfn(model,newdata=train)
  trainpred=ifelse(trainpred0 > 0.5,1,0)
  
  ## Predict on testing data
  testpred0=predictfn(model,newdata=test)
  testpred=ifelse(testpred0 > 0.5,1,0)
  
  ## Organise the data for return
  trainres=data.frame(truth=train[,y],pred=trainpred,pred0=trainpred0)
  testres=data.frame(truth=test[,y],pred=testpred,pred0=testpred0)
  
  ## Compute ROC
  testroc=roc(truth~pred0,data=as.data.frame(testres))
  list(model=model,
       trainres=trainres,
       testres=testres,
       train=train,
       test=test,
       roc=testroc)
}

1.2.2 Logistic regression

require("MASS")
logisticregression=function(formula,data,...){
  glm(formula,family=binomial(link='logit'),data=data,...)
}
testlogistic=logisticregression(http~.,data=train) # quick check that the wrapper fits

logisticmodel=learnmodel(logisticregression,http~.,train,test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

1.2.3 KNN

library("class")
knnclass=function(formula,data,k){
  ## knn does not provide the usual interface, so we define it from scratch here
  ## We need to know what the x and y parts of y~x are, and to store all the data and k
  if(is.character(formula)) formula=formula(formula)
  y=as.character(formula[2])
  x=labels(terms(formula, data=data))
  ret=list(formula=formula,train=data,k=k,x=x,y=y)
  class(ret)="knnclass"
  ret
}
predict.knnclass=function(x,newdata,...){
  ## Run knn on the new data. knn returns the predicted class as a factor,
  ## with the vote proportion for the *winning* class in the "prob" attribute.
  ## We convert this into the probability of class 1.
  predtmp=knn(x$train[,x$x], newdata[,x$x], x$train[,x$y], k = x$k, prob=TRUE)
  pred0=attr(predtmp,"prob")
  pred=as.numeric(predtmp)-1
  pred0[pred==0] = 1-pred0[pred==0]
  pred0
}
## use the new functions
knnmodel=learnmodel(knnclass,http~.,train,test,k=5,predictfn=predict.knnclass)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

1.2.4 LDA

predictlda=function(x,newdata,...){
  ## lda hides the scores in the posterior. Column 1 is the probability of the
  ## first class (0), so the scores are inverted relative to the other models;
  ## pROC compensates by flipping the ROC direction (see the message below)
  predict(x,newdata,type="response")$posterior[,1]
}
ldamodel=learnmodel(lda,http~.,train,test,predictfn=predictlda)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

1.2.5 Support Vector Machine

library(e1071)
svmmodel=learnmodel(svm,http~.,train,test,kernel="linear")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
svmrmodel=learnmodel(svm,http~.,train,test,kernel="radial")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
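
Note that because http is numeric 0/1, svm defaults to eps-regression and predict returns a numeric score that learnmodel thresholds at 0.5. Printing the model is a quick way to confirm the type (our own check, not part of the original block):

svmmodel$model ## the printed summary includes the line "SVM-Type: eps-regression"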

1.2.6 Linear Stacking

Stacking feeds the base models' predicted scores back in as extra features for a second-level learner; here the second level is another logistic regression.

lintrain=cbind(train,
               data.frame(logistic=logisticmodel$trainres$pred0,
                    lda=ldamodel$trainres$pred0,
                    svm=svmmodel$trainres$pred0))
lintest=cbind(test,
               data.frame(logistic=logisticmodel$testres$pred0,
                    lda=ldamodel$testres$pred0,
                    svm=svmmodel$testres$pred0))
linstackmodel=learnmodel(logisticregression,http~.,lintrain,lintest)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
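
The rank-deficient warnings arise because the base-model scores are nearly collinear with each other and with the raw features. As a quick diagnostic (a sketch of our own, not part of the workshop), aliased terms show up as NA coefficients:

coef(linstackmodel$model) ## NA entries mark terms dropped for collinearity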

1.2.7 Boosting

library("fastAdaboost")
predictada=function(x,newdata){
  ## fastAdaboost's predict returns a list; column 2 of $prob is P(class 1)
  predict(x,newdata=newdata,type='response')$prob[,2]
}
adamodel=learnmodel(adaboost,http~.,train,test,predictfn=predictada,nIter=10)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

2. Trees and Forests

2.1 Tree model using the function “tree”.

To work this out from scratch, we would read ?tree and examine whether it returns the structure learnmodel expects, or whether we have to define a custom model-learning function. Do we need a special predictfn? See the check after the code below.

require("tree")
treemodel=learnmodel(tree,http~.,train,test,predictfn=predict)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
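
No special predictfn is needed here: because http is stored as numeric 0/1, tree fits a regression tree and predict returns a plain numeric vector of scores. A minimal check of our own:

str(predict(treemodel$model, newdata=head(test))) ## a numeric vector, as learnmodel expects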

2.2 Random Forest model using the function “randomForest”.

Again, we would read ?randomForest and examine whether it returns the expected structure, or whether we have to define a custom model-learning function. Do we need a special predictfn?

require("randomForest")
rfmodel=learnmodel(randomForest,http~.,train,test,predictfn=predict)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
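
The warning appears because the numeric 0/1 response makes randomForest fit a regression forest. The ROC is unaffected (the scores are still ordered), but we could instead fit a classification forest by converting the response to a factor. A hedged sketch; rfclassmodel and predictrfclass are our own names, not part of the workshop:

trainf=train; trainf$http=factor(trainf$http)
testf=test;  testf$http=factor(testf$http)
predictrfclass=function(x,newdata) predict(x,newdata,type="prob")[,2] ## P(class "1")
rfclassmodel=learnmodel(randomForest,http~.,trainf,testf,predictfn=predictrfclass)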

2.3 Gradient Boosting Machine using the function “gbm”.

This fits boosted decision trees, though the implementation differs from adaboost. We would read ?gbm to examine whether it returns the expected structure, or whether we have to define a custom model-learning function. Do we need a special predictfn?

We fit three models, with an interaction depth of 1 (the default), 2, and 3.

require(gbm)
gbmpredict=function(gbmmodel,newdata){
  ## Predict on the probability scale: the default type="link" would return
  ## log-odds, which breaks the 0.5 threshold used in learnmodel
  predict(gbmmodel,newdata,n.trees=gbmmodel$n.trees,type="response")
}
gbmmodel1=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,n.trees=1000, interaction.depth=1)
## Distribution not specified, assuming bernoulli ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
gbmmodel2=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,n.trees=1000, interaction.depth=2)
## Distribution not specified, assuming bernoulli ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
gbmmodel3=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,n.trees=1000, interaction.depth=3)
## Distribution not specified, assuming bernoulli ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
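
gbm guesses the bernoulli distribution because the response is 0/1. To make this explicit (and silence the message) we could pass it through learnmodel's ..., e.g. (a variant of our own, not in the original):

gbmmodel1b=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,
                      n.trees=1000,interaction.depth=1,distribution="bernoulli")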

2.4 Stacking everything

stacktrain=cbind(train,
               data.frame(logistic=logisticmodel$trainres$pred0,
                    lda=ldamodel$trainres$pred0,
                    svm=svmmodel$trainres$pred0,
                    svmr=svmrmodel$trainres$pred0,
                    knn=knnmodel$trainres$pred0,
                    ada=adamodel$trainres$pred0,
                    tree=treemodel$trainres$pred0,
                    rf=rfmodel$trainres$pred0,
                    gbm1=gbmmodel1$trainres$pred0,
                    gbm2=gbmmodel2$trainres$pred0,
                    gbm3=gbmmodel3$trainres$pred0
                    ))
stacktest=cbind(test,
               data.frame(logistic=logisticmodel$testres$pred0,
                    lda=ldamodel$testres$pred0,
                    svm=svmmodel$testres$pred0,
                    svmr=svmrmodel$testres$pred0,
                    knn=knnmodel$testres$pred0,
                    ada=adamodel$testres$pred0,
                    tree=treemodel$testres$pred0,
                    rf=rfmodel$testres$pred0,
                    gbm1=gbmmodel1$testres$pred0,
                    gbm2=gbmmodel2$testres$pred0,
                    gbm3=gbmmodel3$testres$pred0
                    ))
stackmodel=learnmodel(randomForest,http~.,stacktrain,stacktest)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values.  Are you sure you want to do regression?
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

3. Interpretation

3.1 Feature importances

The feature importances from the stacked model can be extracted in decreasing order by running:

stackmodel$model$importance[order(stackmodel$model$importance,decreasing = TRUE),]
##          gbm3           ada            rf          gbm2          gbm1 
##  1.361137e+02  1.016857e+02  8.268975e+01  5.789160e+01  2.493518e+01 
##           knn          tree          svmr      logistic           lda 
##  2.199674e+01  1.050825e+01  1.032414e+01  1.588043e+00  1.196600e+00 
##    resp_bytes    orig_bytes orig_ip_bytes      duration           svm 
##  1.113968e+00  3.658646e-02  9.073436e-03  7.866117e-03  3.009341e-03 
## resp_ip_bytes 
##  2.715205e-03
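
randomForest can also plot these directly; a one-line sketch using its built-in helper:

varImpPlot(stackmodel$model) ## dot chart of the same importances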

3.2 ROC curves.

The code below plots the ROC curves for all of the models, including the new tree, boosting, and stacking methods. How do they perform? Why do you think the performance is ordered as it is?

require("pROC")

par(mfrow=c(1,3))
for(xmax in c(1,0.05,0.005)){
    plot(1-logisticmodel$roc$specificities,logisticmodel$roc$sensitivities,type="l",
         xlab="False Positive Rate",ylab="True Positive Rate",xlim=c(0,xmax),col=2)
    lines(1-ldamodel$roc$specificities,ldamodel$roc$sensitivities,col=3)
    lines(1-svmmodel$roc$specificities,svmmodel$roc$sensitivities,col=4)
    lines(1-svmrmodel$roc$specificities,svmrmodel$roc$sensitivities,col=4,lty=2)
    lines(1-knnmodel$roc$specificities,knnmodel$roc$sensitivities,col=5,lty=2)
    lines(1-linstackmodel$roc$specificities,linstackmodel$roc$sensitivities,col=6,lty=2)
    lines(1-adamodel$roc$specificities,adamodel$roc$sensitivities,col=7,lty=2)
    lines(1-treemodel$roc$specificities,treemodel$roc$sensitivities,col=8,lty=2,lwd=2)
    lines(1-rfmodel$roc$specificities,rfmodel$roc$sensitivities,col=9,lty=2,lwd=2)
    
    lines(1-gbmmodel1$roc$specificities,gbmmodel1$roc$sensitivities,col=1,lty=3,lwd=2)
    lines(1-gbmmodel2$roc$specificities,gbmmodel2$roc$sensitivities,col=2,lty=3,lwd=2)
    lines(1-gbmmodel3$roc$specificities,gbmmodel3$roc$sensitivities,col=3,lty=3,lwd=2)
    
    lines(1-stackmodel$roc$specificities,stackmodel$roc$sensitivities,col=2,lty=1,lwd=3)
    
    abline(a=0,b=1)
    if(xmax==1){
        legend("bottomright",c("Logistic","LDA","SVM (linear)",
                               "SVM (radial)","K-NN","Stacked linear methods",
                               "Adaboost","Tree","Random Forest",
                               "GBM1","GBM2","GBM3",
                               "Stacked"),
               text.col=c(2,3,4,4,5,6,7,8,9,1,2,3,2),
               col     =c(2,3,4,4,5,6,7,8,9,1,2,3,2),
               lty     =c(1,1,1,2,1,2,2,2,2,3,3,3,1),
               lwd     =c(1,1,1,1,1,1,1,2,2,2,2,2,3),bty="n",cex=0.75)
    }
}
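
Alongside the plots, a single summary number per model is useful. A sketch of our own that collects the fitted models and reports each test-set AUC via pROC:

models=list(logistic=logisticmodel,lda=ldamodel,svm=svmmodel,svmr=svmrmodel,
            knn=knnmodel,linstack=linstackmodel,ada=adamodel,tree=treemodel,
            rf=rfmodel,gbm1=gbmmodel1,gbm2=gbmmodel2,gbm3=gbmmodel3,
            stack=stackmodel)
sort(sapply(models,function(m) as.numeric(auc(m$roc))),decreasing=TRUE)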

See Q6.1 in the Block 6 Portfolio about interpreting these results.

3.3 Saving data for Python

Save the data in a format we can easily read into Python. We will be interested in comparing the ROC for the Random Forest.

write.csv(train,file="conndataC_train.csv")
write.csv(test,file="conndataC_test.csv")
write.csv(data.frame(fpr=1-rfmodel$roc$specificities,tpr=rfmodel$roc$sensitivities),file="conndataC_RFroc.csv")
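
As a final sanity check (our own addition), the saved ROC points can be read back in R before moving to Python:

rfroc=read.csv("conndataC_RFroc.csv",row.names=1)
head(rfroc) ## fpr/tpr pairs matching the plotted Random Forest curve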