统计机器学习的实验设计
Caret包简介
北京市出租车数据案例
Caret是R中一个集成各种统计机器学习算法包的平台,其中包含了147种不同的统计机器学习方法与模型。
Caret包可以自动化地完成数据的交叉验证与参数选择过程。
Caret包支持并行计算功能,可以大大提高计算效率。
Data Splitting (Train, Test, Sampling)
Preprocessing Data (Clean, Standardize, Transform, Impute)
Training and Testing Approaches (Train and Predict)
Model Comparison (Cross-validation, Confusion Matrix)
哪些状态的出租车在下一时刻更容易非空?
哪些指标与出租车下一时刻是否非空相关?
我们能够给出出租车下一时刻非空可能性的定量化估计么?
指标设计
# Read the prepared data set from disk and preview its first three rows.
after <- read.csv(file = "final_data.csv")
head(x = after, n = 3)
## Car_ID Hour five state speed_ave speed_sd disNam rush1 rush2 stateBef
## 1 100164 0 5 0 0 0 dongcheng FALSE FALSE 0
## 2 100164 0 6 0 0 0 dongcheng FALSE FALSE 0
## 3 100164 0 7 0 0 0 dongcheng FALSE FALSE 0
## logspeed_ave logspeed_sd ave1 sd1 chongwen haidian chaoyang
## 1 0 0 -2.179164 -2.248029 FALSE FALSE FALSE
## 2 0 0 -2.179164 -2.248029 FALSE FALSE FALSE
## 3 0 0 -2.179164 -2.248029 FALSE FALSE FALSE
## dongcheng xicheng xuanwu
## 1 TRUE FALSE FALSE
## 2 TRUE FALSE FALSE
## 3 TRUE FALSE FALSE
# Hold-out evaluation of a plain logistic regression on an 80/20 random split.
set.seed(2017) # fix the random seed so the split is reproducible
n_obs <- nrow(after)
# Row numbers of the training sample. Named `train_idx` rather than `sample`
# so that the base function sample() is not masked by a variable.
train_idx <- sample(n_obs, round(0.8 * n_obs))
trainset <- after[train_idx, ] # training set
testset <- after[-train_idx, ] # test set: every row not drawn for training
# Logistic regression of `state` on the predictors listed in the formula.
fit1 <- glm(
  state ~ ave1 + sd1 + chongwen + haidian + chaoyang + dongcheng +
    xicheng + xuanwu + rush1 + rush2 + stateBef,
  family = "binomial", data = trainset
)
p1 <- predict(fit1, testset) # predictions on the link (log-odds) scale
# Inverse logit. plogis() stays finite for large |p1|, whereas
# exp(p1) / (1 + exp(p1)) evaluates to Inf / Inf = NaN once exp() overflows,
# which would leave such rows in class 0 below.
score1 <- plogis(p1)
pstate1 <- rep(0, length(p1)) # start with every row predicted as class 0
pstate1[score1 >= 0.5] <- 1 # predict class 1 at the 0.5 probability threshold
table(pstate1, testset$state) / length(p1) # confusion matrix as proportions
##
## pstate1 0 1
## 0 0.26887770 0.07047675
## 1 0.05537459 0.60527095
# Accuracy: share of test rows whose predicted class equals the observed one.
# The vectors are compared directly instead of summing diag() of the confusion
# table, because that table is not square (and diag() then picks the wrong
# cells) whenever one of the classes is never predicted.
accuracy1 <- sum(pstate1 == testset$state, na.rm = TRUE) / length(p1)
accuracy1
## [1] 0.8741487
# Exploratory analysis: overlay the densities of speed_ave for the two values
# of `state` in the training data, then try a single-cutoff rule on the test set.
dens_state0 <- density(trainset$speed_ave[trainset$state == 0])
dens_state1 <- density(trainset$speed_ave[trainset$state == 1])
plot(dens_state0, col = "red", main = "", xlab = "Frequence of 'ave'")
lines(dens_state1, col = "blue")
abline(v = 7, col = "black") # vertical line at the cutoff used below
# Rule-based prediction: 1 when speed_ave is above 7, otherwise 0.
pstate2 <- as.numeric(testset$speed_ave > 7)
table(pstate2, testset$state) / length(pstate2)
##
## pstate2 0 1
## 0 0.1539828 0.0740302
## 1 0.1702695 0.6017175
# Accuracy of the cutoff rule: share of test rows where prediction and
# observed state agree. Comparing the vectors directly avoids diag() on a
# confusion table that is not square when one class is never predicted.
accuracy2 <- sum(pstate2 == testset$state, na.rm = TRUE) / length(pstate2)
accuracy2
## [1] 0.7557003
#install.packages("caret", dependencies = c("Depends", "Suggests"))
library("lattice")
library("ggplot2")
library("caret")
## Warning: package 'caret' was built under R version 3.4.4
# Train/test split and cross-validation folds built with caret's helpers.
set.seed(2017)
# caret's data-splitting function: row numbers of the 80% training part.
inTrain <- createDataPartition(after$state, p = 0.8, list = FALSE)
trainset <- after[inTrain, ]
testset <- after[-inTrain, ]
dim(trainset)
## [1] 13510 20
dim(testset)
## [1] 3377 20
# Five folds over the full data; returnTrain = TRUE requests the training
# row numbers of each fold.
folds <- createFolds(after$state, k = 5, list = TRUE, returnTrain = TRUE)
lengths(folds)
## Fold1 Fold2 Fold3 Fold4 Fold5
## 13510 13510 13510 13509 13509
folds[["Fold1"]][seq_len(10)]
## [1] 3 4 5 6 7 8 9 10 11 12
## [1] 3 4 5 6 7 8 9 10 11 12
– Try createResample() and createTimeSlices().
# Standardising speed_ave: by hand, then with caret's preProcess().
# Printed results are kept as `##` comments so the chunk remains valid R.
mean(after$speed_ave)
## [1] 22.84735
sd(after$speed_ave)
## [1] 18.18804
# Manual version: subtract the training mean and divide by the training sd.
train_ave <- (trainset$speed_ave - mean(trainset$speed_ave)) /
  sd(trainset$speed_ave)
mean(train_ave)
## [1] -7.893213e-17
sd(train_ave)
## [1] 1
# caret version. The disNam column (column 7 in the head() output of `after`)
# is excluded by name rather than by position, so the code does not depend on
# the column order; the subset is built once and reused for both calls.
train_features <- trainset[, names(trainset) != "disNam", drop = FALSE]
preObj <- preProcess(train_features, method = c("center", "scale"))
speed_ave <- predict(preObj, train_features)$speed_ave
mean(speed_ave)
## [1] -7.893213e-17
sd(speed_ave)
## [1] 1
– Try other methods, e.g., imputation or the Box-Cox transform.
# Logistic regression fitted through caret's train() interface.
fit3 <- train(
  factor(state) ~ ave1 + sd1 + chongwen + haidian + chaoyang + dongcheng +
    xicheng + xuanwu + rush1 + rush2 + stateBef,
  method = "glm", family = "binomial", data = trainset
)
# Predicted classes for the test rows.
pstate3 <- predict(fit3, newdata = testset)
# Confusion matrix of the predictions against the observed states.
confusionMatrix(data = pstate3, reference = factor(testset$state))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 908 238
## 1 187 2044
##
## Accuracy : 0.8741
## 95% CI : (0.8625, 0.8852)
## No Information Rate : 0.6757
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7163
## Mcnemar's Test P-Value : 0.01529
##
## Sensitivity : 0.8292
## Specificity : 0.8957
## Pos Pred Value : 0.7923
## Neg Pred Value : 0.9162
## Prevalence : 0.3243
## Detection Rate : 0.2689
## Detection Prevalence : 0.3394
## Balanced Accuracy : 0.8625
##
## 'Positive' Class : 0
##
# Classification tree (caret method "rpart") on the same predictors.
fit4 <- train(
  factor(state) ~ ave1 + sd1 + chongwen + haidian + chaoyang + dongcheng +
    xicheng + xuanwu + rush1 + rush2 + stateBef,
  method = "rpart", data = trainset
)
# Predicted classes for the test rows.
pstate4 <- predict(fit4, newdata = testset)
# Confusion matrix of the predictions against the observed states.
confusionMatrix(data = pstate4, reference = factor(testset$state))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 915 235
## 1 180 2047
##
## Accuracy : 0.8771
## 95% CI : (0.8656, 0.888)
## No Information Rate : 0.6757
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7232
## Mcnemar's Test P-Value : 0.008031
##
## Sensitivity : 0.8356
## Specificity : 0.8970
## Pos Pred Value : 0.7957
## Neg Pred Value : 0.9192
## Prevalence : 0.3243
## Detection Rate : 0.2710
## Detection Prevalence : 0.3405
## Balanced Accuracy : 0.8663
##
## 'Positive' Class : 0
##
Try plot(fit4$finalModel)
# Bagged trees (caret method "treebag") on the same predictors.
fit5 <- train(
  factor(state) ~ ave1 + sd1 + chongwen + haidian + chaoyang + dongcheng +
    xicheng + xuanwu + rush1 + rush2 + stateBef,
  method = "treebag", data = trainset
)
# Predicted classes for the test rows.
pstate5 <- predict(fit5, newdata = testset)
# Confusion matrix of the predictions against the observed states.
confusionMatrix(data = pstate5, reference = factor(testset$state))
# Model fitted with caret method "gam" on the same predictors.
# NOTE(review): this section was headed "Boosting", but "gam" is caret's
# generalized additive model, not a boosting method. If boosting was intended,
# a boosting method (presumably "gbm") would be needed here -- confirm.
fit6 <- train(
  factor(state) ~ ave1 + sd1 + chongwen + haidian + chaoyang + dongcheng +
    xicheng + xuanwu + rush1 + rush2 + stateBef,
  method = "gam", data = trainset
)
# Predicted classes for the test rows.
pstate6 <- predict(fit6, newdata = testset)
# Confusion matrix of the predictions against the observed states.
confusionMatrix(data = pstate6, reference = factor(testset$state))
# Random forest (caret method "rf") on the same predictors.
fit7 <- train(
  factor(state) ~ ave1 + sd1 + chongwen + haidian + chaoyang + dongcheng +
    xicheng + xuanwu + rush1 + rush2 + stateBef,
  method = "rf", data = trainset
)
# Predicted classes for the test rows.
pstate7 <- predict(fit7, newdata = testset)
# Confusion matrix of the predictions against the observed states.
confusionMatrix(data = pstate7, reference = factor(testset$state))
# Ensemble (stacking): the five models' test-set class predictions become the
# predictors of a second-level model for the observed state.
combPre <- data.frame(
  pstate3, pstate4, pstate5, pstate6, pstate7,
  state = testset$state
)
combfit <- train(factor(state) ~ ., method = "gam", data = combPre)
# The predictors of combfit are the pstate* columns of combPre; testset does
# not contain them, so predictions must be requested on combPre.
combpstate <- predict(combfit, newdata = combPre)
# NOTE(review): combfit is trained and scored on the same rows, so the
# accuracy reported here is optimistic; a separate validation split would be
# needed for an honest estimate.
confusionMatrix(combpstate, factor(testset$state))