统计机器学习的实验设计
Caret包简介
北京市出租车数据案例
Caret是一个R中统计机器学习算法包的集成平台。里面包含了147个各种不同的统计机器学习方法与模型。
Caret包自动化的完成数据的交叉验证与参数选择过程。
Caret包支持平行计算功能,大大提高计算效率。
Data Splitting (Train, Test, Sampling)
Preprocessing Data (Clear, Standardize, Transform, Impute)
Training and Testing Approaches (Train and Predict)
Model Comparison (Cross-validation, Confusion Matrix)
哪些状态的出租车在下一时刻更容易非空?
哪些指标与出租车下一时刻是否非空相关?
我们能够给出出租车下一时刻非空可能性的定量化估计么?
-state:五分钟内出租车是否为空(有载客状态则视为非空)
after <- read.csv("final_data.csv")
head(after, 3)
Car_ID Hour five state speed_ave speed_sd disNam rush1 rush2 stateBef
1 100164 0 5 0 0 0 dongcheng FALSE FALSE 0
2 100164 0 6 0 0 0 dongcheng FALSE FALSE 0
3 100164 0 7 0 0 0 dongcheng FALSE FALSE 0
logspeed_ave logspeed_sd ave1 sd1 chongwen haidian chaoyang
1 0 0 -2.179164 -2.248029 FALSE FALSE FALSE
2 0 0 -2.179164 -2.248029 FALSE FALSE FALSE
3 0 0 -2.179164 -2.248029 FALSE FALSE FALSE
dongcheng xicheng xuanwu
1 TRUE FALSE FALSE
2 TRUE FALSE FALSE
3 TRUE FALSE FALSE
set.seed(2017) #设计随机数种子
sample <- sample(dim(after)[1],round(0.8*dim(after)[1])) #产生训练样本的编号
trainset <- after[sample,] #构建训练集
testset <- after[-sample,] #构建测试集
fit1 <- glm(state~ave1+sd1+chongwen+haidian+chaoyang+dongcheng
+xicheng+xuanwu+rush1+rush2+stateBef,family="binomial", data = trainset) #逻辑回归
p1 <- predict(fit1,testset) #测试集合进行预测
score1 <- exp(p1)/(1+exp(p1)) #转化成概率
pstate1 <- rep(0, length(p1)) #选择0.5作为阈值
pstate1[score1>=0.5] <- 1 #赋值
table(pstate1, testset$state)/length(p1) # 构建混淆矩阵
pstate1 0 1
0 0.26887770 0.07047675
1 0.05537459 0.60527095
accuracy1 <- sum(diag(table(pstate1,testset$state)/length(p1))) #正确率
accuracy1
[1] 0.8741487
# EDA
plot(density(trainset$speed_ave[trainset$state == 0]),
col = "red", main = "", xlab = "Frequence of 'ave'")
lines(density(trainset$speed_ave[trainset$state == 1]), col = "blue")
abline(v = 7, col = "black")
pstate2 <- ifelse(testset$speed_ave>7, 1, 0)
table(pstate2, testset$state)/length(pstate2)
pstate2 0 1
0 0.1539828 0.0740302
1 0.1702695 0.6017175
accuracy2 <- sum(diag(table(pstate2,testset$state)/length(pstate2)))
accuracy2
[1] 0.7557003
#install.packages("caret", dependencies = c("Depends", "Suggests"))
library("lattice")
library("ggplot2")
library("caret")
set.seed(2017)
inTrain <- createDataPartition(y = after$state, p = 0.8, list = FALSE) #重要的数据划分函数
trainset <- after[inTrain, ]
testset <- after[-inTrain, ]
dim(trainset)
[1] 13510 20
dim(testset)
[1] 3377 20
folds <- createFolds(y = after$state, k = 5, list = TRUE, returnTrain = TRUE)
sapply(folds, length)
Fold1 Fold2 Fold3 Fold4 Fold5
13509 13510 13510 13510 13509
folds$Fold1[1:10]
[1] 1 2 3 4 5 6 7 8 9 10
– Try createResample()
and createTimeSlices()
.
mean(after$speed_ave)
> [1] 22.84735
sd(after$speed_ave)
> [1] 18.18804
train_ave <- (trainset$speed_ave - mean(trainset$speed_ave))/sd(trainset$speed_ave)
mean(train_ave)
> [1] -7.893213e-17
sd(train_ave)
> [1] 1
preObj <- preProcess(trainset[,-7], method = c("center", "scale")) #数据预处理函数
speed_ave <- predict(preObj, trainset[,-7])$speed_ave
mean(speed_ave)
[1] -7.893213e-17
sd(speed_ave)
[1] 1
– Try other methods, e.g., impute
, cox-box trnsform
.
fit3 <- train(factor(state)~ave1+sd1+chongwen+haidian
+chaoyang+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "glm",family="binomial") #训练函数
pstate3 <- predict(fit3, newdata = testset) #预测函数
confusionMatrix(pstate3, factor(testset$state)) #混淆矩阵
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 908 238
1 187 2044
Accuracy : 0.8741
95% CI : (0.8625, 0.8852)
No Information Rate : 0.6757
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.7163
Mcnemar's Test P-Value : 0.01529
Sensitivity : 0.8292
Specificity : 0.8957
Pos Pred Value : 0.7923
Neg Pred Value : 0.9162
Prevalence : 0.3243
Detection Rate : 0.2689
Detection Prevalence : 0.3394
Balanced Accuracy : 0.8625
'Positive' Class : 0
fit4 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset,method = "rpart") #训练函数
pstate4 <- predict(fit4, newdata = testset) #预测函数
confusionMatrix(pstate4, factor(testset$state))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 915 235
1 180 2047
Accuracy : 0.8771
95% CI : (0.8656, 0.888)
No Information Rate : 0.6757
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7232
Mcnemar's Test P-Value : 0.008031
Sensitivity : 0.8356
Specificity : 0.8970
Pos Pred Value : 0.7957
Neg Pred Value : 0.9192
Prevalence : 0.3243
Detection Rate : 0.2710
Detection Prevalence : 0.3405
Balanced Accuracy : 0.8663
'Positive' Class : 0
library(rattle)
fancyRpartPlot(fit4$finalModel)
Try plot(fit4$finalModel)
fit5 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "treebag")
pstate5 <- predict(fit5, newdata = testset)
confusionMatrix(pstate5, factor(testset$state))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 832 230
1 263 2052
Accuracy : 0.854
95% CI : (0.8416, 0.8658)
No Information Rate : 0.6757
P-Value [Acc > NIR] : <2e-16
Kappa : 0.6642
Mcnemar's Test P-Value : 0.1495
Sensitivity : 0.7598
Specificity : 0.8992
Pos Pred Value : 0.7834
Neg Pred Value : 0.8864
Prevalence : 0.3243
Detection Rate : 0.2464
Detection Prevalence : 0.3145
Balanced Accuracy : 0.8295
'Positive' Class : 0
# Boosting
fit6 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "gam")
pstate6 <- predict(fit6, newdata = testset)
confusionMatrix(pstate6, factor(testset$state))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 918 249
1 177 2033
Accuracy : 0.8739
95% CI : (0.8622, 0.8849)
No Information Rate : 0.6757
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.717
Mcnemar's Test P-Value : 0.0005818
Sensitivity : 0.8384
Specificity : 0.8909
Pos Pred Value : 0.7866
Neg Pred Value : 0.9199
Prevalence : 0.3243
Detection Rate : 0.2718
Detection Prevalence : 0.3456
Balanced Accuracy : 0.8646
'Positive' Class : 0
# Random Forest
fit7 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "rf")
pstate7 <- predict(fit7, newdata = testset)
confusionMatrix(pstate7, factor(testset$state))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 916 236
1 179 2046
Accuracy : 0.8771
95% CI : (0.8656, 0.888)
No Information Rate : 0.6757
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7233
Mcnemar's Test P-Value : 0.005979
Sensitivity : 0.8365
Specificity : 0.8966
Pos Pred Value : 0.7951
Neg Pred Value : 0.9196
Prevalence : 0.3243
Detection Rate : 0.2712
Detection Prevalence : 0.3411
Balanced Accuracy : 0.8666
'Positive' Class : 0
combPre <- data.frame(pstate3, pstate4, pstate5, pstate6, pstate7, state = testset$state) #集成
combfit <- train(factor(state)~., method = "gam", data = combPre)
combpstate <- predict(combfit, newdata = testset)
confusionMatrix(combpstate, factor(testset$state))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 914 235
1 181 2047
Accuracy : 0.8768
95% CI : (0.8653, 0.8877)
No Information Rate : 0.6757
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7225
Mcnemar's Test P-Value : 0.009362
Sensitivity : 0.8347
Specificity : 0.8970
Pos Pred Value : 0.7955
Neg Pred Value : 0.9188
Prevalence : 0.3243
Detection Rate : 0.2707
Detection Prevalence : 0.3402
Balanced Accuracy : 0.8659
'Positive' Class : 0
Thank you