This document is available via html here: https://rpubs.com/JRobertsDS/Prediction
For this project, we were supplied measurements from subjects while they were doing exercises, classified into five categories depending on the manner in which the subjects were performing the exercises. Our goal was to predict which categories were most likely to apply to 20 new sets of measurements, which were also supplied.
Steps taken:
Random forest models provide a built-in estimate of out-of-sample error without a separate cross-validation step: each tree is fit on a bootstrap sample of the training data and evaluated on the observations left out of that sample (the out-of-bag error). The model estimates out-of-sample error to be less than 1%, and we verified this by subsetting the training set, fitting smaller models, and computing their accuracy on held-out subsets of the training data (not shown). The final model uses the entire training data set.
# Reproducibility: fix the RNG state used by model fitting below.
set.seed (1234)
# Packages used throughout this analysis: dplyr for select(),
# caret for preProcess()/train()/varImp().
library (dplyr)
library (caret)
# The data were downloaded once; the commands are kept for provenance.
#download.file ("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", "pml-training.csv", method = "curl")
#download.file ("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", "pml-testing.csv", method = "curl")
# Treat empty strings and spreadsheet division errors ("#DIV/0!") as NA so
# the summary-statistic columns are read as numeric rather than text.
rawTraining <- read.csv ("pml-training.csv", na.strings = c("NA", "", "#DIV/0!"))
rawTesting <- read.csv ("pml-testing.csv", na.strings = c("NA", "", "#DIV/0!"))
# Quick structural look at the first ten columns (ids, timestamps, first sensors).
str (rawTraining[, 1:10])
## 'data.frame': 19622 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ user_name : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ raw_timestamp_part_1: int 1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
## $ raw_timestamp_part_2: int 788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
## $ cvtd_timestamp : Factor w/ 20 levels "02/12/2011 13:32",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ new_window : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ num_window : int 11 11 11 12 12 12 12 12 12 12 ...
## $ roll_belt : num 1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
# List the first 40 column names: identifiers, raw sensor measurements, and
# the per-window summary-statistic columns that will be dropped below.
head (names (rawTraining), 40)
## [1] "X" "user_name" "raw_timestamp_part_1"
## [4] "raw_timestamp_part_2" "cvtd_timestamp" "new_window"
## [7] "num_window" "roll_belt" "pitch_belt"
## [10] "yaw_belt" "total_accel_belt" "kurtosis_roll_belt"
## [13] "kurtosis_picth_belt" "kurtosis_yaw_belt" "skewness_roll_belt"
## [16] "skewness_roll_belt.1" "skewness_yaw_belt" "max_roll_belt"
## [19] "max_picth_belt" "max_yaw_belt" "min_roll_belt"
## [22] "min_pitch_belt" "min_yaw_belt" "amplitude_roll_belt"
## [25] "amplitude_pitch_belt" "amplitude_yaw_belt" "var_total_accel_belt"
## [28] "avg_roll_belt" "stddev_roll_belt" "var_roll_belt"
## [31] "avg_pitch_belt" "stddev_pitch_belt" "var_pitch_belt"
## [34] "avg_yaw_belt" "stddev_yaw_belt" "var_yaw_belt"
## [37] "gyros_belt_x" "gyros_belt_y" "gyros_belt_z"
## [40] "accel_belt_x"
# Distribution of the outcome classes. barplot(table(...)) works whether
# classe was read as a factor (R < 4.0 default) or as character (R >= 4.0),
# unlike hist(as.numeric(...)), which yields all NA for a character column.
barplot (table (rawTraining$classe))
# Drop identifier, timestamp, and sparse per-window summary columns, keeping
# only the raw sensor measurements plus the outcome.
# NOTE(review): tidyselect contains() is case-insensitive by default, so
# "X" also matches the *_x sensor columns -- confirm the intended columns
# survive, e.g. by checking names(training) / ncol(training).
colsToRemove <- c("avg", "stddev", "max", "min", "var", "raw", "user", "X", "time", "kurtosis", "skewness", "amplitude_yaw")
training <- select (rawTraining, -contains (colsToRemove))
testing <- select (rawTesting, -contains (colsToRemove))
# Center/scale the predictors, drop near-zero-variance columns, and impute
# remaining NAs via k-nearest neighbours; the transform learned on the
# training set is applied identically to both data sets.
preTrain <- preProcess (training, method = c("center", "scale", "nzv", "knnImpute"))
trainData <- predict (preTrain, training)
testData <- predict (preTrain, testing)
# Use a manually cached model when available so each Knit does not re-fit
# the (slow) random forest; otherwise fit it once and cache it to disk.
if (file.exists ("forestFit.RDS")) {
    forestFit <- readRDS ("forestFit.RDS")
} else {
    forestFit <- train (classe ~ ., method = "rf", verbose = FALSE, data = trainData, na.action = na.omit)
    saveRDS (forestFit, file = "forestFit.RDS")
}
# Summary of the pre-processing transform applied above.
preTrain
## Created from 406 samples and 51 variables
##
## Pre-processing:
## - centered (46)
## - ignored (2)
## - 5 nearest neighbor imputation (46)
## - removed (3)
## - scaled (46)
#forestFit$finalModel
# In-band (training-set) accuracy: the fraction of training rows whose
# predicted class equals the recorded class. Expected to be optimistic
# relative to out-of-sample performance.
forestTrainPredictions <- predict (forestFit, trainData)
forestTrainAccuracy <- mean (trainData$classe == forestTrainPredictions)
forestTrainAccuracy
## [1] 1
# Variable importance: which predictors the forest relied on most.
forestImp <- varImp (forestFit)
plot (forestImp)
These are the predictions generated on the test data:
# Cached copy of the expected output, for quick comparison with the run below:
# forestTestPredictions [1] B A B A A E D B A A B C B A E E A B B B
# Apply the fitted forest to the pre-processed 20-row test set.
forestTestPredictions <- predict (forestFit, testData)
forestTestPredictions
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
Data source: http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har