Executive Summary

This document is available as HTML here: https://rpubs.com/JRobertsDS/Prediction

For this project, we were supplied measurements recorded while subjects performed exercises, each labeled with one of five classes according to the manner in which the exercise was performed. Our goal was to predict which class was most likely to apply to each of 20 new sets of measurements, which were also supplied.

Steps taken: load the data, explore it, remove summary and bookkeeping columns, preprocess (center, scale, drop near-zero-variance predictors, impute with k-nearest neighbors), fit a random forest, and predict the class of each of the 20 test cases.

Random forest models make a separate cross-validation step less critical, because each tree is grown on a bootstrap sample and can be scored on the observations left out of that sample; this out-of-bag error is itself an estimate of out-of-sample error. The model estimates out-of-sample error to be less than 1%, and we verified this by subsetting the training set, fitting smaller models, and computing their accuracy on other subsets of the training data (not shown). The final model uses the entire training data set.

Set up for reproducibility

set.seed (1234)

Load Data

#download.file ("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", "pml-training.csv", method = "curl")
#download.file ("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", "pml-testing.csv", method = "curl")
rawTraining <- read.csv ("pml-training.csv")
rawTesting <- read.csv ("pml-testing.csv")
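
The raw CSVs are known to encode missing values in several ways: "NA", empty strings, and the spreadsheet artifact "#DIV/0!" in the summary columns. A minimal alternative read (shown commented out, following the convention above) that brings all three in as NA so the summary columns stay numeric:

#rawTraining <- read.csv ("pml-training.csv", na.strings = c("NA", "", "#DIV/0!"))
#rawTesting <- read.csv ("pml-testing.csv", na.strings = c("NA", "", "#DIV/0!"))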

Exploratory Analysis

str (rawTraining[, 1:10])
## 'data.frame':    19622 obs. of  10 variables:
##  $ X                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ user_name           : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ raw_timestamp_part_1: int  1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
##  $ raw_timestamp_part_2: int  788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
##  $ cvtd_timestamp      : Factor w/ 20 levels "02/12/2011 13:32",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ new_window          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ num_window          : int  11 11 11 12 12 12 12 12 12 12 ...
##  $ roll_belt           : num  1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
##  $ pitch_belt          : num  8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
##  $ yaw_belt            : num  -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
head (names (rawTraining), 40)
##  [1] "X"                    "user_name"            "raw_timestamp_part_1"
##  [4] "raw_timestamp_part_2" "cvtd_timestamp"       "new_window"          
##  [7] "num_window"           "roll_belt"            "pitch_belt"          
## [10] "yaw_belt"             "total_accel_belt"     "kurtosis_roll_belt"  
## [13] "kurtosis_picth_belt"  "kurtosis_yaw_belt"    "skewness_roll_belt"  
## [16] "skewness_roll_belt.1" "skewness_yaw_belt"    "max_roll_belt"       
## [19] "max_picth_belt"       "max_yaw_belt"         "min_roll_belt"       
## [22] "min_pitch_belt"       "min_yaw_belt"         "amplitude_roll_belt" 
## [25] "amplitude_pitch_belt" "amplitude_yaw_belt"   "var_total_accel_belt"
## [28] "avg_roll_belt"        "stddev_roll_belt"     "var_roll_belt"       
## [31] "avg_pitch_belt"       "stddev_pitch_belt"    "var_pitch_belt"      
## [34] "avg_yaw_belt"         "stddev_yaw_belt"      "var_yaw_belt"        
## [37] "gyros_belt_x"         "gyros_belt_y"         "gyros_belt_z"        
## [40] "accel_belt_x"
hist (as.numeric (rawTraining$classe))
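
Since classe is categorical, a barplot of its frequency table shows the class balance more directly; a minimal alternative sketch:

barplot (table (rawTraining$classe), xlab = "classe", ylab = "count")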

Clean Data

library (dplyr) # select (), contains ()
library (caret) # preProcess (), train (), varImp ()
colsToRemove <- c("avg", "stddev", "max", "min", "var", "raw", "user", "X", "time", "kurtosis", "skewness", "amplitude_yaw")
training <- select (rawTraining, -contains (colsToRemove))
testing <- select (rawTesting, -contains (colsToRemove))
preTrain <- preProcess (training, method = c("center", "scale", "nzv", "knnImpute"))
trainData <- predict (preTrain, training)
testData <- predict (preTrain, testing)
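
A quick sanity check (illustrative, not part of the original analysis) that the near-zero-variance filter and kNN imputation left a complete data set:

dim (trainData)                    # rows x columns after preprocessing
sum (!complete.cases (trainData)) # knnImpute should leave this at 0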

Compute Random Forest

forestFit <- readRDS ("forestFit.RDS") # read the manually cached model, so it doesn't have to be re-fit on every Knit
#forestFit <- train (classe ~ ., method = "rf", verbose = FALSE, data = trainData, na.action = na.omit)
#   saveRDS (forestFit, file="forestFit.RDS")
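
Since the forest's out-of-bag error already estimates out-of-sample performance, caret can also be told to resample that way rather than bootstrapping, which typically fits much faster; a sketch (commented out, like the cached call above), assuming the defaults are otherwise unchanged:

#forestFit <- train (classe ~ ., method = "rf", data = trainData,
#                    trControl = trainControl (method = "oob"), na.action = na.omit)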

Explore the Random Forest

preTrain
## Created from 406 samples and 51 variables
## 
## Pre-processing:
##   - centered (46)
##   - ignored (2)
##   - 5 nearest neighbor imputation (46)
##   - removed (3)
##   - scaled (46)
#forestFit$finalModel

# In-sample (training set) accuracy; see the hold-out sketch below for an out-of-sample estimate
forestTrainPredictions <- predict (forestFit, trainData)
forestTrainAccuracy <- (sum (trainData$classe == forestTrainPredictions)) / length (forestTrainPredictions)
forestTrainAccuracy
## [1] 1
# Variable Importance
forestImp <- varImp (forestFit)
plot (forestImp)
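
Because an in-sample accuracy of 1 says little about generalization, the hold-out check mentioned in the summary can be sketched as follows (the 75/25 split is illustrative; this is not the original verification code):

inTrain <- createDataPartition (trainData$classe, p = 0.75, list = FALSE)
subFit <- train (classe ~ ., method = "rf", data = trainData[inTrain, ],
                 trControl = trainControl (method = "oob"), na.action = na.omit)
subPred <- predict (subFit, trainData[-inTrain, ])
mean (subPred == trainData[-inTrain, "classe"]) # estimated out-of-sample accuracy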

Conclusion

These are the predictions generated on the test data:

forestTestPredictions <- predict (forestFit, testData) 
forestTestPredictions 
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E

Data source: http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har