This is Block 6 Workshop (which is 6.2) part 1. Part 2 is in Python.
This adds trees and forests to our experimentations from block 5 on classification.
We implement a Random Forest in R, and save some data for comparison to python. This uses the same representation as we used in block05.
if(!require("gbm")) install.packages("gbm")
## Loading required package: gbm
## Loaded gbm 2.1.8.1
if(!require("knitr")) install.packages("knitr")
## Loading required package: knitr
if(!require("MASS")) install.packages("MASS")
## Loading required package: MASS
if(!require("class")) install.packages("class")
## Loading required package: class
if(!require("e1071")) install.packages("e1071")
## Loading required package: e1071
if(!require("pROC")) install.packages("pROC")
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
if(!require("devtools")) install.packages("devtools")
## Loading required package: devtools
## Loading required package: usethis
if(!require("tree")) install.packages("tree")
## Loading required package: tree
if(!require("randomForest")) install.packages("randomForest")
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library("devtools") # for install_github
if(!require("fastAdaboost")) devtools::install_github("souravc83/fastAdaboost")
## Loading required package: fastAdaboost
library("knitr") #
library("MASS") # For logistic regression
library("class") # For knn
library("e1071") # For SVM
library("pROC") # For ROCs
library("fastAdaboost") # For adaboost
library("gbm")
library("tree") # For decision trees
library("randomForest") # For... random forests
We’re making a simple classification problem out of the netflow data: can we tell what is HTTP?
Load the saved data for test/train split:
## Load the test/train split of the netflow connection data saved in Block 5.
## row.names=1 restores the original row indices from the CSV's first column.
test=read.csv(file=file.path("..","data","conndataC_test.csv"),row.names=1)
train=read.csv(file=file.path("..","data","conndataC_train.csv"),row.names=1)
Here we have duplicated the Block05 models for convenience.
require("pROC")
## Generic supervised-learning wrapper.
##
## Fits `modelclass` on `train` with `formula`, scores both train and test
## with `predictfn`, thresholds the scores at 0.5 into hard 0/1 labels, and
## computes a ROC curve on the test scores.
##
## Arguments:
##   modelclass: model-fitting function with signature f(formula, data, ...)
##   formula:    model formula (or a string coercible to one); LHS is the target
##   train,test: data frames containing the target column
##   predictfn:  function(model, newdata) returning one numeric score per row
##   ...:        passed through to `modelclass`
## Returns a list: model, trainres, testres, train, test, roc.
learnmodel=function(modelclass,formula,train,test,
                    predictfn=function(x,newdata)predict(x,newdata,type="response"),
                    ...){
  ## Start by sorting out the formula target.
  ## is.character() is the robust test; class(x)=="character" is fragile.
  if(is.character(formula)) formula=formula(formula)
  y=as.character(formula[2])   # name of the response variable
  ## Now run the learning
  model=modelclass(formula,data=train,...)
  ## Predict on training data
  trainpred0=predictfn(model,newdata=train)
  trainpred=ifelse(trainpred0 > 0.5,1,0)   # hard labels at the 0.5 cutoff
  ## Predict on testing data
  testpred0=predictfn(model,newdata=test)
  testpred=ifelse(testpred0 > 0.5,1,0)
  ## Organise the data for return
  trainres=data.frame(truth=train[,y],pred=trainpred,pred0=trainpred0)
  testres=data.frame(truth=test[,y],pred=testpred,pred0=testpred0)
  ## Compute ROC on the held-out scores.
  ## (testres is already a data.frame; the old as.data.frame() was redundant.)
  testroc=roc(truth~pred0,data=testres)
  list(model=model,
       trainres=trainres,
       testres=testres,
       train=train,
       test=test,
       roc=testroc)
}
require("MASS")
## Binomial GLM with a logit link, wrapped so it matches the
## learnmodel() calling convention of f(formula, data, ...).
logisticregression=function(formula,data,...){
  fit <- glm(formula, data = data, family = binomial(link = "logit"), ...)
  fit
}
## Fit logistic regression directly, then again via the learnmodel() wrapper.
testlogistic <- logisticregression(http ~.,data=train)
logisticmodel=learnmodel(logisticregression,http~.,train,test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
library("class")
## Constructor for a k-nearest-neighbour "model" object.
## class::knn() has no formula/fit/predict interface, so we record everything
## needed to run it lazily at prediction time: the formula, the full training
## data, k, and the names of the predictor (x) and target (y) columns.
knnclass=function(formula,data,k){
  ## Accept a formula given as a string, mirroring learnmodel().
  ## is.character() replaces the fragile class(formula)=="character" test.
  if(is.character(formula)) formula=formula(formula)
  y=as.character(formula[2])            # response column name
  x=labels(terms(formula, data=data))   # predictor column names (expands ".")
  ret=list(formula=formula,train=data,k=k,x=x,y=y)
  class(ret)="knnclass"                 # enables predict.knnclass dispatch
  ret
}
## predict() method for knnclass objects.
## Runs class::knn() on the stored training data, then converts its output
## (a factor with a "prob" attribute holding the winning class's vote share)
## into the probability of class "1".
predict.knnclass=function(x,newdata,...){
  predtmp=knn(x$train[,x$x], newdata[,x$x], x$train[,x$y], k = x$k, prob=TRUE)
  ## Use the full attribute name: the original attr(predtmp,"pr") only
  ## worked because attr() does partial matching — the attribute is "prob".
  pred0=attr(predtmp,"prob")
  pred=as.numeric(predtmp)-1          # factor levels "0","1" -> numeric 0/1
  ## "prob" is the vote share for the *predicted* class; flip it where the
  ## prediction was 0 so pred0 is always P(class 1).
  pred0[pred==0] = 1-pred0[pred==0]
  pred0
}
## use the new functions
## K-NN (k=5) via the custom wrapper; predict.knnclass supplies the scores.
knnmodel=learnmodel(knnclass,http~.,train,test,k=5,predictfn=predict.knnclass)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## predict() helper for MASS::lda fits, extracting a score for learnmodel().
## lda hides the per-class probabilities in $posterior, whose columns follow
## the factor levels of the response (here "0" then "1").
predictlda=function(x,newdata,...){
  ## BUG FIX: the original returned posterior[,1] = P(class 0), so the
  ## 0.5-thresholded labels in learnmodel() came out inverted (pROC had to
  ## flip the ROC orientation to "controls > cases"). Column 2 is P(class 1).
  predict(x,newdata,type="response")$posterior[,2]
}
## LDA, then linear- and radial-kernel SVMs, all through learnmodel().
ldamodel=learnmodel(lda,http~.,train,test,predictfn=predictlda)
## Setting levels: control = 0, case = 1
## NOTE(review): the flipped direction below shows the lda scores were
## oriented opposite to the other models'; pROC detects and corrects this.
## Setting direction: controls > cases
library(e1071)
svmmodel=learnmodel(svm,http~.,train,test,kernel="linear")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
svmrmodel=learnmodel(svm,http~.,train,test,kernel="radial")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Stack the three *linear* classifiers: append each model's train/test
## scores as new feature columns, then fit a logistic regression on top.
lintrain=cbind(train,
data.frame(logistic=logisticmodel$trainres$pred0,
lda=ldamodel$trainres$pred0,
svm=svmmodel$trainres$pred0))
lintest=cbind(test,
data.frame(logistic=logisticmodel$testres$pred0,
lda=ldamodel$testres$pred0,
svm=svmmodel$testres$pred0))
linstackmodel=learnmodel(logisticregression,http~.,lintrain,lintest)
## The warnings below are expected: near-perfect base-model scores separate
## the classes (probabilities 0/1) and the score columns are collinear with
## the raw features, making the design matrix rank deficient.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
library("fastAdaboost")
## Score extractor for fastAdaboost models: column 2 of the returned
## $prob matrix holds the probability of the positive class.
predictada=function(x,newdata){
  ada_out <- predict(x, newdata = newdata, type = 'response')
  ada_out$prob[, 2]
}
## Adaboost with 10 boosting iterations; predictada extracts the class-1 score.
adamodel=learnmodel(adaboost,http~.,train,test,predictfn=predictada,nIter=10)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
To figure this out from scratch, we would use `?tree` and examine whether it returns the expected structure, or whether we have to define a custom model learning function. Do we need a special `predictfn`?
require("tree")
## tree() accepts the standard formula interface and its predict() returns
## numeric scores directly, so plain predict works as the predictfn.
treemodel=learnmodel(tree,http~.,train,test,predictfn=predict)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
We would need `?randomForest`, and should examine whether it returns the expected structure, or whether we have to define a custom model learning function. Do we need a special `predictfn`?
require("randomForest")
## randomForest also takes a formula; because http is numeric 0/1 rather than
## a factor, it runs in regression mode — hence the warning below. The
## regression predictions still serve as scores for the 0.5 threshold.
rfmodel=learnmodel(randomForest,http~.,train,test,predictfn=predict)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
This fits a decision tree using boosting, though implemented differently from `adaboost`. We need `?gbm` to examine whether it returns the expected structure, or whether we have to define a custom model learning function. Do we need a special `predictfn`?
We fit 3 models, for a tree depth of 1 (the default), 2 and 3.
require(gbm)
## predict() helper for gbm fits. gbm requires n.trees to be given
## explicitly, so pass the number actually fitted from the model object.
gbmpredict=function(gbmmodel,newdata){
  ## BUG FIX: predict.gbm defaults to type="link" (log-odds for bernoulli),
  ## so the original scores were thresholded at 0.5 on the wrong scale and
  ## were not probabilities when stacked. type="response" returns P(class 1).
  predict(gbmmodel,newdata,n.trees=gbmmodel$n.trees,type="response")
}
## Three gradient-boosting fits with increasing interaction depth
## (stumps, 2-way, 3-way); gbm infers a bernoulli loss from the 0/1 response.
gbmmodel1=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,n.trees=1000, interaction.depth=1)
## Distribution not specified, assuming bernoulli ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
gbmmodel2=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,n.trees=1000, interaction.depth=2)
## Distribution not specified, assuming bernoulli ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
gbmmodel3=learnmodel(gbm,http~.,train,test,predictfn=gbmpredict,n.trees=1000, interaction.depth=3)
## Distribution not specified, assuming bernoulli ...
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Stack *all* eleven models: append every model's train/test scores as
## feature columns, then fit a random forest on top as the meta-learner.
stacktrain=cbind(train,
data.frame(logistic=logisticmodel$trainres$pred0,
lda=ldamodel$trainres$pred0,
svm=svmmodel$trainres$pred0,
svmr=svmrmodel$trainres$pred0,
knn=knnmodel$trainres$pred0,
ada=adamodel$trainres$pred0,
tree=treemodel$trainres$pred0,
rf=rfmodel$trainres$pred0,
gbm1=gbmmodel1$trainres$pred0,
gbm2=gbmmodel2$trainres$pred0,
gbm3=gbmmodel3$trainres$pred0
))
stacktest=cbind(test,
data.frame(logistic=logisticmodel$testres$pred0,
lda=ldamodel$testres$pred0,
svm=svmmodel$testres$pred0,
svmr=svmrmodel$testres$pred0,
knn=knnmodel$testres$pred0,
ada=adamodel$testres$pred0,
tree=treemodel$testres$pred0,
rf=rfmodel$testres$pred0,
gbm1=gbmmodel1$testres$pred0,
gbm2=gbmmodel2$testres$pred0,
gbm3=gbmmodel3$testres$pred0
))
## Same regression-mode warning as before: http is numeric 0/1, not a factor.
stackmodel=learnmodel(randomForest,http~.,stacktrain,stacktest)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
The feature importances from the stacked model can be extracted in order by running:
## Rank the stacked forest's features (base-model scores + raw columns)
## by variable importance, largest first.
stackmodel$model$importance[order(stackmodel$model$importance,decreasing = TRUE),]
## gbm3 ada rf gbm2 gbm1
## 1.361137e+02 1.016857e+02 8.268975e+01 5.789160e+01 2.493518e+01
## knn tree svmr logistic lda
## 2.199674e+01 1.050825e+01 1.032414e+01 1.588043e+00 1.196600e+00
## resp_bytes orig_bytes orig_ip_bytes duration svm
## 1.113968e+00 3.658646e-02 9.073436e-03 7.866117e-03 3.009341e-03
## resp_ip_bytes
## 2.715205e-03
The code below plots the new boosting and stacking ROC curves. How do they perform? Why do you think that the performance is ordered as we see?
require("pROC")
## Plot every model's ROC curve three times, zooming the false-positive axis
## from the full range down to the low-FPR region where the models differ.
par(mfrow=c(1,3))
for(xmax in c(1,0.05,0.005)){
## First curve sets up the axes; the rest are overlaid with lines().
plot(1-logisticmodel$roc$specificities,logisticmodel$roc$sensitivities,type="l",
xlab="False Positive Rate",ylab="True Positive Rate",xlim=c(0,xmax),col=2)
lines(1-ldamodel$roc$specificities,ldamodel$roc$sensitivities,col=3)
lines(1-svmmodel$roc$specificities,svmmodel$roc$sensitivities,col=4)
lines(1-svmrmodel$roc$specificities,svmrmodel$roc$sensitivities,col=4,lty=2)
lines(1-knnmodel$roc$specificities,knnmodel$roc$sensitivities,col=5,lty=2)
lines(1-linstackmodel$roc$specificities,linstackmodel$roc$sensitivities,col=6,lty=2)
lines(1-adamodel$roc$specificities,adamodel$roc$sensitivities,col=7,lty=2)
lines(1-treemodel$roc$specificities,treemodel$roc$sensitivities,col=8,lty=2,lwd=2)
lines(1-rfmodel$roc$specificities,rfmodel$roc$sensitivities,col=9,lty=2,lwd=2)
lines(1-gbmmodel1$roc$specificities,gbmmodel1$roc$sensitivities,col=1,lty=3,lwd=2)
lines(1-gbmmodel2$roc$specificities,gbmmodel2$roc$sensitivities,col=2,lty=3,lwd=2)
lines(1-gbmmodel3$roc$specificities,gbmmodel3$roc$sensitivities,col=3,lty=3,lwd=2)
lines(1-stackmodel$roc$specificities,stackmodel$roc$sensitivities,col=2,lty=1,lwd=3)
## Diagonal = performance of random guessing.
abline(a=0,b=1)
## Only the full-range panel gets a legend; colours/linetypes/widths must
## stay in the same order as the lines() calls above.
if(xmax==1){
legend("bottomright",c("Logistic","LDA","SVM (linear)",
"SVM (radial)","K-NN","Stacked linear methods",
"Adaboost","Tree","Random Forest",
"GBM1","GBM2","GBM3",
"Stacked"),
text.col=c(2,3,4,4,5,6,7,8,9,1,2,3,2),
col =c(2,3,4,4,5,6,7,8,9,1,2,3,2),
lty =c(1,1,1,2,1,2,2,2,2,3,3,3,1),
lwd =c(1,1,1,1,1,1,1,2,2,2,2,2,3),bty="n",cex=0.75)
}
}
Think carefully about interpreting these results.
## Save the splits and the random-forest ROC curve for comparison with the
## Python notebook in part 2.
## NOTE(review): these write to the working directory, whereas the data was
## read from file.path("..","data") — confirm the intended output location.
write.csv(train,file="conndataC_train.csv")
write.csv(test,file="conndataC_test.csv")
write.csv(data.frame(fpr=1-rfmodel$roc$specificities,tpr=rfmodel$roc$sensitivities),file="conndataC_RFroc.csv")