1 Machine Learning with R

  • Experimental design for statistical machine learning

  • An introduction to the Caret package

  • Case study: Beijing taxi data

2 Experimental Design for Statistical Machine Learning

3 Caret

The Caret reference manual

  • Caret is an integrated R platform for statistical machine learning, providing a unified interface to 147 different methods and models.

  • Caret automates cross-validation and tuning-parameter selection.

  • Caret supports parallel computation, which greatly improves efficiency (see the sketch after this list).

  • Data Splitting (Train, Test, Sampling)

  • Preprocessing Data (Clean, Standardize, Transform, Impute)

  • Training and Testing Approaches (Train and Predict)

  • Model Comparison (Cross-validation, Confusion Matrix)
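
A minimal sketch of the parallel set-up referred to above, assuming the doParallel package is installed; caret picks up any registered foreach backend automatically.

library(caret)
library(doParallel)                    # assumption: doParallel is installed
cl <- makePSOCKcluster(4)              # start 4 worker processes
registerDoParallel(cl)                 # register the backend; train() will use it
ctrl <- trainControl(method = "cv", number = 5, allowParallel = TRUE)
# fit <- train(factor(state) ~ ., data = trainset, method = "rf", trControl = ctrl)
stopCluster(cl)                        # release the workers when finished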

4 Case Study: Beijing Taxi Data

  • In which states is a taxi more likely to be occupied in the next time interval?

  • Which indicators are related to whether a taxi is occupied in the next interval?

  • Can we give a quantitative estimate of the probability that a taxi is occupied in the next interval?

4.1 Data

  • Sample source: one day of taxi status records
  • Operating-status records kept: 0 = empty, 1 = occupied; 180,188 records in total
  • Indicators are aggregated over five-minute windows, giving 16,887 observations (the raw records are not sampled at even time intervals)

  • state: whether the taxi is empty during the five-minute window (any occupied record within the window counts as occupied)

Indicator design

  1. Whether the taxi was empty in the previous interval: a window containing any occupied record is labelled occupied, otherwise empty
  2. Mean speed: average speed within the five-minute window
  3. Speed standard deviation: standard deviation of speed within the window
  4. Morning rush hour (rush1): 1 if 7 ≤ Hour ≤ 9, otherwise 0
  5. Evening rush hour (rush2): 1 if 17 ≤ Hour ≤ 20, otherwise 0
  6. District: average the longitude and latitude over the window, compute the distance to each of the six districts (dongcheng, xicheng, chongwen, xuanwu, chaoyang, haidian), and assign the nearest one as the taxi's district for that window (see the sketch after this list)
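
A hypothetical sketch of step 6. The district-centre coordinates below are rough placeholders, not the values used to build final_data.csv; distances are plain Euclidean on the averaged longitude and latitude.

centers <- data.frame(
  disNam = c("dongcheng", "xicheng", "chongwen", "xuanwu", "chaoyang", "haidian"),
  lon    = c(116.42, 116.37, 116.42, 116.36, 116.44, 116.30),   # placeholder values
  lat    = c(39.93, 39.91, 39.88, 39.89, 39.92, 39.96)          # placeholder values
)
nearest_district <- function(lon, lat) {
  d <- sqrt((centers$lon - lon)^2 + (centers$lat - lat)^2)  # planar distance is fine at city scale
  as.character(centers$disNam[which.min(d)])
}
nearest_district(116.41, 39.90)  # returns the closest of the six districts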
after <- read.csv("final_data.csv")
head(after, 3)
##   Car_ID Hour five state speed_ave speed_sd    disNam rush1 rush2 stateBef
## 1 100164    0    5     0         0        0 dongcheng FALSE FALSE        0
## 2 100164    0    6     0         0        0 dongcheng FALSE FALSE        0
## 3 100164    0    7     0         0        0 dongcheng FALSE FALSE        0
##   logspeed_ave logspeed_sd      ave1       sd1 chongwen haidian chaoyang
## 1            0           0 -2.179164 -2.248029    FALSE   FALSE    FALSE
## 2            0           0 -2.179164 -2.248029    FALSE   FALSE    FALSE
## 3            0           0 -2.179164 -2.248029    FALSE   FALSE    FALSE
##   dongcheng xicheng xuanwu
## 1      TRUE   FALSE  FALSE
## 2      TRUE   FALSE  FALSE
## 3      TRUE   FALSE  FALSE

4.2 Logistic Regression

set.seed(2017) # fix the random seed

sample <- sample(dim(after)[1],round(0.8*dim(after)[1])) # draw the row indices of the training sample

trainset <- after[sample,] # training set (80%)

testset <- after[-sample,] # test set (20%)

fit1 <- glm(state~ave1+sd1+chongwen+haidian+chaoyang+dongcheng
+xicheng+xuanwu+rush1+rush2+stateBef,family="binomial", data = trainset) # logistic regression

p1 <- predict(fit1,testset) # predict on the test set (linear-predictor scale)

score1 <- exp(p1)/(1+exp(p1)) # convert to probabilities

pstate1 <- rep(0, length(p1)) # use 0.5 as the classification threshold

pstate1[score1>=0.5] <- 1 # predict "occupied" above the threshold
table(pstate1, testset$state)/length(p1) # confusion matrix as proportions
##        
## pstate1          0          1
##       0 0.26887770 0.07047675
##       1 0.05537459 0.60527095
accuracy1 <- sum(diag(table(pstate1,testset$state)/length(p1))) # accuracy

accuracy1
## [1] 0.8741487
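
The manual exp(p1)/(1+exp(p1)) step is not needed: predict.glm returns probabilities directly on the response scale.

score1b <- predict(fit1, testset, type = "response") # probabilities in one step
all.equal(score1b, score1)                           # TRUE: matches the manual computation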

4.3 Naive Method

# EDA: density of mean speed for empty (red) vs. occupied (blue) windows
plot(density(trainset$speed_ave[trainset$state == 0]), 
     col = "red", main = "", xlab = "speed_ave")
lines(density(trainset$speed_ave[trainset$state == 1]), col = "blue")
abline(v = 7, col = "black") # hand-picked cut-off used by the naive rule below

pstate2 <- ifelse(testset$speed_ave>7, 1, 0) # naive rule: predict occupied when mean speed exceeds 7
table(pstate2, testset$state)/length(pstate2) # confusion matrix as proportions
##        
## pstate2         0         1
##       0 0.1539828 0.0740302
##       1 0.1702695 0.6017175
accuracy2 <- sum(diag(table(pstate2,testset$state)/length(pstate2)))
accuracy2
## [1] 0.7557003
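
A quick sanity check of the hand-picked cut-off: how test accuracy of the rule speed_ave > t varies across thresholds t.

thresholds <- seq(1, 20, by = 1)
acc <- sapply(thresholds, function(t)
  mean(ifelse(testset$speed_ave > t, 1, 0) == testset$state))
plot(thresholds, acc, type = "b", xlab = "threshold on speed_ave", ylab = "test accuracy")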

4.4 Preprocessing for ML

#install.packages("caret", dependencies = c("Depends", "Suggests"))
library("lattice")
library("ggplot2")
library("caret")
## Warning: package 'caret' was built under R version 3.4.4
set.seed(2017)
inTrain <- createDataPartition(y = after$state, p = 0.8, list = FALSE) # caret's data-splitting helper (stratified on the outcome)
trainset <- after[inTrain, ]
testset <- after[-inTrain, ]
dim(trainset)
## [1] 13510    20
dim(testset)
## [1] 3377   20
folds <- createFolds(y = after$state, k = 5, list = TRUE, returnTrain = TRUE)
sapply(folds, length)
## Fold1 Fold2 Fold3 Fold4 Fold5 
## 13510 13510 13510 13509 13509
folds$Fold1[1:10]
##  [1]  3  4  5  6  7  8  9 10 11 12

– Try createResample() and createTimeSlices().
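
A sketch of the two helpers; the time slices are only meaningful if the rows are in time order.

# Bootstrap resamples: each element holds row indices drawn with replacement.
boots <- createResample(y = after$state, times = 5, list = TRUE)
sapply(boots, length)

# Rolling-origin time slices: train on a window of 1000 rows, test on the next 100.
slices <- createTimeSlices(y = after$state, initialWindow = 1000,
                           horizon = 100, fixedWindow = TRUE, skip = 100)
str(slices, max.level = 1)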

mean(after$speed_ave)
## [1] 22.84735
sd(after$speed_ave)
## [1] 18.18804
train_ave <- (trainset$speed_ave - mean(trainset$speed_ave))/sd(trainset$speed_ave) # manual centring and scaling
mean(train_ave)
## [1] -7.893213e-17
sd(train_ave)
## [1] 1
preObj <- preProcess(trainset[,-7], method = c("center", "scale")) # preprocessing: centre and scale the numeric columns (column 7, disNam, is excluded)

speed_ave <- predict(preObj, trainset[,-7])$speed_ave # apply the stored transformation

mean(speed_ave)
## [1] -7.893213e-17
sd(speed_ave)
## [1] 1

– Try other methods, e.g., imputation and the Box-Cox transform.
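
A sketch of both. Box-Cox needs strictly positive input, so the zero speeds are shifted by 1; knnImpute fills missing values from the k nearest rows (and centres/scales as a side effect).

preBox <- preProcess(data.frame(speed1 = trainset$speed_ave + 1), method = "BoxCox")
preBox   # prints a summary of the estimated transformation

preImp <- preProcess(trainset[, c("speed_ave", "speed_sd", "ave1", "sd1")],
                     method = "knnImpute")
trainImp <- predict(preImp, trainset[, c("speed_ave", "speed_sd", "ave1", "sd1")])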

  • LR in Caret
fit3 <- train(factor(state)~ave1+sd1+chongwen+haidian
              +chaoyang+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "glm",family="binomial") # caret's unified training function

pstate3 <- predict(fit3, newdata = testset) # unified prediction interface
confusionMatrix(pstate3, factor(testset$state)) # confusion matrix and summary statistics
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  908  238
##          1  187 2044
##                                           
##                Accuracy : 0.8741          
##                  95% CI : (0.8625, 0.8852)
##     No Information Rate : 0.6757          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7163          
##  Mcnemar's Test P-Value : 0.01529         
##                                           
##             Sensitivity : 0.8292          
##             Specificity : 0.8957          
##          Pos Pred Value : 0.7923          
##          Neg Pred Value : 0.9162          
##              Prevalence : 0.3243          
##          Detection Rate : 0.2689          
##    Detection Prevalence : 0.3394          
##       Balanced Accuracy : 0.8625          
##                                           
##        'Positive' Class : 0               
## 
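
By default train() evaluates candidates with bootstrap resampling; the cross-validation mentioned in the Caret overview is requested through trainControl().

ctrl <- trainControl(method = "cv", number = 5) # 5-fold cross-validation
fit3cv <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
                +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef,
                data = trainset, method = "glm", family = "binomial", trControl = ctrl)
fit3cv$resample # per-fold accuracy and kappa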

4.5 CART

fit4 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset,method = "rpart") # classification tree (CART) via rpart
pstate4 <- predict(fit4, newdata = testset) # predict on the test set
confusionMatrix(pstate4, factor(testset$state))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  915  235
##          1  180 2047
##                                          
##                Accuracy : 0.8771         
##                  95% CI : (0.8656, 0.888)
##     No Information Rate : 0.6757         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7232         
##  Mcnemar's Test P-Value : 0.008031       
##                                          
##             Sensitivity : 0.8356         
##             Specificity : 0.8970         
##          Pos Pred Value : 0.7957         
##          Neg Pred Value : 0.9192         
##              Prevalence : 0.3243         
##          Detection Rate : 0.2710         
##    Detection Prevalence : 0.3405         
##       Balanced Accuracy : 0.8663         
##                                          
##        'Positive' Class : 0              
## 

Try plot(fit4$finalModel)
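
A short sketch: base-graphics plotting of the final rpart tree, plus the accuracy recorded for each candidate complexity parameter.

plot(fit4$finalModel, uniform = TRUE, margin = 0.1) # draw the tree
text(fit4$finalModel, use.n = TRUE, cex = 0.8)      # add split labels and node counts
fit4$results                                        # accuracy for each candidate cp value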

4.6 Bagging

fit5 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "treebag") # bagged classification trees (ipred)
pstate5 <- predict(fit5, newdata = testset)
confusionMatrix(pstate5, factor(testset$state))

4.7 Boosting

# Boosting: stochastic gradient boosting of trees (caret method "gbm")
fit6 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset,
              method = "gbm", verbose = FALSE) # verbose = FALSE suppresses gbm's iteration log
pstate6 <- predict(fit6, newdata = testset)
confusionMatrix(pstate6, factor(testset$state))
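
train() tunes the boosting model over a small default grid; the selected parameters and the accuracy profile can be inspected.

fit6$bestTune # chosen number of trees, depth, shrinkage and minimum node size
plot(fit6)    # accuracy across the tuning grid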

4.8 Random Forest

# Random Forest
fit7 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "rf")
pstate7 <- predict(fit7, newdata = testset)
confusionMatrix(pstate7, factor(testset$state))
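
Variable importance from the random forest speaks to the second case question: which indicators are related to being occupied in the next interval.

varImp(fit7)                 # scaled importance of each predictor
plot(varImp(fit7), top = 10) # dot plot of the most important predictors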

4.9 Ensemble learning

combPre <- data.frame(pstate3, pstate4, pstate5, pstate6, pstate7, state = testset$state) # stack the individual test-set predictions
combfit <- train(factor(state)~., method = "gam", data = combPre) # combiner model fitted on the stacked predictions
combpstate <- predict(combfit, newdata = combPre) # the combiner takes the stacked predictions, not the raw test set, as input
confusionMatrix(combpstate, factor(testset$state))
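
A quick side-by-side of test-set accuracy for all fitted models; note that the stacked model is trained on the same test-set predictions it is evaluated on, so its accuracy reads optimistically.

sapply(list(glm = pstate3, cart = pstate4, bagging = pstate5,
            boosting = pstate6, rf = pstate7, stacked = combpstate),
       function(p) mean(p == testset$state))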