1 基于R的机器学习

统计机器学习的实验设计
Caret包简介
北京市出租车数据案例

2 统计机器学习的实验设计

3 Caret

Caret参考手册

Caret是一个R中统计机器学习算法包的集成平台。里面包含了147个各种不同的统计机器学习方法与模型。
Caret包自动化的完成数据的交叉验证与参数选择过程。
Caret包支持平行计算功能，大大提高计算效率。
Data Splitting (Train, Test, Sampling)
Preprocessing Data (Clear, Standardize, Transform, Impute)
Training and Testing Approaches (Train and Predict)
Model Comparison (Cross-validation, Confusion Matrix)

4 北京市出租车数据案例

哪些状态的出租车在下一时刻更容易非空？
哪些指标与出租车下一时刻是否非空相关？
我们能够给出出租车下一时刻非空可能性的定量化估计么？

4.1 Data

样本来源：采用一天的出租车状态数据
筛选运营状态：0=空车，1=载客，共计180188条
按照五分钟汇总指标，最终数据共包含16887个观测（原因：数据采集不平均)

state:五分钟内出租车是否为空（有载客状态则视为非空）

指标设计

上一时刻是否是空车：五分钟内有载客状态则视为载客，否则为空车
速度平均值：五分钟内速度的平均值
速度标准差：五分钟内速度的标准差
是否是早高峰（rush1）：7≤Hour≤9定义为1，否则为0.
是否是晚高峰（rush2）： 17≤Hour≤20定义为1，否则为0.
所在区域：计算五分钟内经纬度平均值并计算与以下区域的差距，选择最近的区域作为车辆五分钟内所在区域。

after <- read.csv("final_data.csv")
head(after, 3)

##   Car_ID Hour five state speed_ave speed_sd    disNam rush1 rush2 stateBef
## 1 100164    0    5     0         0        0 dongcheng FALSE FALSE        0
## 2 100164    0    6     0         0        0 dongcheng FALSE FALSE        0
## 3 100164    0    7     0         0        0 dongcheng FALSE FALSE        0
##   logspeed_ave logspeed_sd      ave1       sd1 chongwen haidian chaoyang
## 1            0           0 -2.179164 -2.248029    FALSE   FALSE    FALSE
## 2            0           0 -2.179164 -2.248029    FALSE   FALSE    FALSE
## 3            0           0 -2.179164 -2.248029    FALSE   FALSE    FALSE
##   dongcheng xicheng xuanwu
## 1      TRUE   FALSE  FALSE
## 2      TRUE   FALSE  FALSE
## 3      TRUE   FALSE  FALSE

4.2 Logistic Regression

set.seed(2017) #设计随机数种子

sample <- sample(dim(after)[1],round(0.8*dim(after)[1])) #产生训练样本的编号

trainset <- after[sample,] #构建训练集

testset <- after[-sample,] #构建测试集

fit1 <- glm(state~ave1+sd1+chongwen+haidian+chaoyang+dongcheng
+xicheng+xuanwu+rush1+rush2+stateBef,family="binomial", data = trainset) #逻辑回归

p1 <- predict(fit1,testset) #测试集合进行预测

score1 <- exp(p1)/(1+exp(p1)) #转化成概率

pstate1 <- rep(0, length(p1)) #选择0.5作为阈值

pstate1[score1>=0.5] <- 1 #赋值

table(pstate1, testset$state)/length(p1) # 构建混淆矩阵

##        
## pstate1          0          1
##       0 0.26887770 0.07047675
##       1 0.05537459 0.60527095

accuracy1 <- sum(diag(table(pstate1,testset$state)/length(p1))) #正确率

accuracy1

## [1] 0.8741487

4.3 Naive Method

# EDA
plot(density(trainset$speed_ave[trainset$state == 0]), 
     col = "red", main = "", xlab = "Frequence of 'ave'")
lines(density(trainset$speed_ave[trainset$state == 1]), col = "blue")
abline(v = 7, col = "black")

pstate2 <- ifelse(testset$speed_ave>7, 1, 0) 
table(pstate2, testset$state)/length(pstate2)

##        
## pstate2         0         1
##       0 0.1539828 0.0740302
##       1 0.1702695 0.6017175

accuracy2 <- sum(diag(table(pstate2,testset$state)/length(pstate2)))
accuracy2

## [1] 0.7557003

4.4 Preprocessing for ML

#install.packages("caret", dependencies = c("Depends", "Suggests"))
library("lattice")
library("ggplot2")
library("caret")

## Warning: package 'caret' was built under R version 3.4.4

set.seed(2017)
inTrain <- createDataPartition(y = after$state, p = 0.8, list = FALSE) #重要的数据划分函数
trainset <- after[inTrain, ]
testset <- after[-inTrain, ]
dim(trainset)

## [1] 13510    20

dim(testset)

## [1] 3377   20

folds <- createFolds(y = after$state, k = 5, list = TRUE, returnTrain = TRUE)
sapply(folds, length)

## Fold1 Fold2 Fold3 Fold4 Fold5 
## 13510 13510 13510 13509 13509

folds$Fold1[1:10]

##  [1]  3  4  5  6  7  8  9 10 11 12

– Try createResample() and createTimeSlices().

mean(after$speed_ave)

> [1] 22.84735

sd(after$speed_ave)

> [1] 18.18804

train_ave <- (trainset$speed_ave - mean(trainset$speed_ave))/sd(trainset$speed_ave)
mean(train_ave)

> [1] -7.893213e-17

sd(train_ave)

> [1] 1

preObj <- preProcess(trainset[,-7], method = c("center", "scale")) #数据预处理函数

speed_ave <- predict(preObj, trainset[,-7])$speed_ave

mean(speed_ave)

## [1] -7.893213e-17

sd(speed_ave)

## [1] 1

– Try other methods, e.g., impute, cox-box trnsform.

LR in Caret

fit3 <- train(factor(state)~ave1+sd1+chongwen+haidian
              +chaoyang+dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "glm",family="binomial") #训练函数

pstate3 <- predict(fit3, newdata = testset) #预测函数

confusionMatrix(pstate3, factor(testset$state)) #混淆矩阵

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  908  238
##          1  187 2044
##                                           
##                Accuracy : 0.8741          
##                  95% CI : (0.8625, 0.8852)
##     No Information Rate : 0.6757          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7163          
##  Mcnemar's Test P-Value : 0.01529         
##                                           
##             Sensitivity : 0.8292          
##             Specificity : 0.8957          
##          Pos Pred Value : 0.7923          
##          Neg Pred Value : 0.9162          
##              Prevalence : 0.3243          
##          Detection Rate : 0.2689          
##    Detection Prevalence : 0.3394          
##       Balanced Accuracy : 0.8625          
##                                           
##        'Positive' Class : 0               
##

4.5 CRT

fit4 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset,method = "rpart") #训练函数
pstate4 <- predict(fit4, newdata = testset) #预测函数

confusionMatrix(pstate4, factor(testset$state))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  915  235
##          1  180 2047
##                                          
##                Accuracy : 0.8771         
##                  95% CI : (0.8656, 0.888)
##     No Information Rate : 0.6757         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.7232         
##  Mcnemar's Test P-Value : 0.008031       
##                                          
##             Sensitivity : 0.8356         
##             Specificity : 0.8970         
##          Pos Pred Value : 0.7957         
##          Neg Pred Value : 0.9192         
##              Prevalence : 0.3243         
##          Detection Rate : 0.2710         
##    Detection Prevalence : 0.3405         
##       Balanced Accuracy : 0.8663         
##                                          
##        'Positive' Class : 0              
##

Try plot(fit4$finalModel)

4.6 Bagging

fit5 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "treebag")
pstate5 <- predict(fit5, newdata = testset)

confusionMatrix(pstate5, factor(testset$state))

4.7 Boosting

# Boosting
fit6 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "gam")
pstate6 <- predict(fit6, newdata = testset)

confusionMatrix(pstate6, factor(testset$state))

4.8 Random Forest

# Random Forest
fit7 <- train(factor(state)~ave1+sd1+chongwen+haidian+chaoyang
              +dongcheng+xicheng+xuanwu+rush1+rush2+stateBef, data = trainset, method = "rf")
pstate7 <- predict(fit7, newdata = testset)

confusionMatrix(pstate7, factor(testset$state))

4.9 Ensemble learning

combPre <- data.frame(pstate3, pstate4, pstate5, pstate6, pstate7, state = testset$state) #集成
combfit <- train(factor(state)~., method = "gam", data = combPre)
combpstate <- predict(combfit, newdata = testset)

confusionMatrix(combpstate, factor(testset$state))

R语言及其在社会分析中的应用

常象宇

Update 03/07/2018

1 基于R的机器学习

2 统计机器学习的实验设计

3 Caret

4 北京市出租车数据案例

4.1 Data

4.2 Logistic Regression

4.3 Naive Method

4.4 Preprocessing for ML

4.5 CRT

4.6 Bagging

4.7 Boosting

4.8 Random Forest

4.9 Ensemble learning