一言不合就学R

商务大数据智能分析R6

Lecturer : 申 旌 周
jingzhou_shen@stu.xjtu.edu.cn

Instructor : 常 象 宇
xiangyuchang@xjtu.edu.cn

2017年4月16日

概览

  • MXNet
  • XGBoost

MXNet

  • 多层神经网络
  • 自定义神经网络
  • 手写数字竞赛

安装加载包

# One-time setup: install mxnet from the dmlc drat repository
# (uncomment the three lines below on first use).
# install.packages("drat")
# drat:::addRepo("dmlc")
# install.packages("mxnet")
library(mxnet)

多层神经网络 - 函数

  • mx.mlp

    Usage
    mx.mlp(data, label, hidden_node = 1, out_node, dropout = NULL, activation = "tanh", out_activation = "softmax", device = mx.ctx.default(), ...)

  • 训练数据与预测变量

  • 每个隐藏层的大小

  • 输出层的结点数

  • 激活函数类型

  • 损失函数类型

  • 进行训练的硬件(CPU还是GPU)

  • 其他传给mx.model.FeedForward.create的高级参数

多层神经网络 - 数据

  • Sonar:一个二分类数据
  • 划分训练集和测试集
# Load the Sonar data set (208 rows x 61 cols; column 61 is the
# two-level class label) from the mlbench package.
# NOTE: library() errors immediately if mlbench is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(mlbench)
data(Sonar, package = "mlbench")
dim(Sonar)
[1] 208  61
# Recode the factor label (two levels) to 0/1 as mx.mlp expects.
Sonar[, 61] <- as.numeric(Sonar[, 61]) - 1
# Rows 1-50 and 100-150 form the training set; the remaining rows
# (51-99 and 151-208) are held out as the test set.
train.ind <- c(1:50, 100:150)
train.x <- data.matrix(Sonar[train.ind, 1:60])
train.y <- Sonar[train.ind, 61]
test.x <- data.matrix(Sonar[-train.ind, 1:60])
test.y <- Sonar[-train.ind, 61]

多层神经网络 - 训练

# Fix the RNG seed so the run is reproducible, then fit a
# single-hidden-layer (10 node) MLP with a softmax output
# over the 2 classes on the Sonar training data.
mx.set.seed(0)
model <- mx.mlp(
  train.x, train.y,
  hidden_node = 10,
  out_node = 2,
  out_activation = "softmax",
  num.round = 9,
  array.batch.size = 15,
  learning.rate = 0.07,
  momentum = 0.9,
  eval.metric = mx.metric.accuracy
)
Start training with 1 devices
[1] Train-accuracy=0.488888888888889
[2] Train-accuracy=0.514285714285714
[3] Train-accuracy=0.514285714285714
[4] Train-accuracy=0.514285714285714
[5] Train-accuracy=0.514285714285714
[6] Train-accuracy=0.523809523809524
[7] Train-accuracy=0.619047619047619
[8] Train-accuracy=0.695238095238095
[9] Train-accuracy=0.695238095238095

多层神经网络 - 预测

# Predict class probabilities on the test set.  preds has one row
# per class and one column per observation, so max.col(t(preds))
# picks the most probable class per observation; subtracting 1
# maps it back to the 0/1 label coding.  Cross-tabulate against
# the true labels to get a confusion matrix.
preds <- predict(model, test.x)
pred.label <- max.col(t(preds)) - 1
table(pred.label, test.y)
          test.y
pred.label  0  1
         0 24 14
         1 36 33

自定义神经网络 - 数据

  • BostonHousing
  • 划分训练集和测试集
# Load the BostonHousing regression data set (506 rows x 14 cols;
# column 14, medv, is the response) from the mlbench package.
data(BostonHousing, package="mlbench")
dim(BostonHousing)
[1] 506  14
# Every third row (1, 4, 7, ...) goes to the training set; the
# rest form the test set.  Column 14 (medv) is the target.
train.ind <- seq(1, 506, 3)
train.x <- data.matrix(BostonHousing[train.ind, -14])
train.y <- BostonHousing[train.ind, 14]
test.x <- data.matrix(BostonHousing[-train.ind, -14])
test.y <- BostonHousing[-train.ind, 14]

自定义神经网络 - 定义

# Define the input data node of the symbolic graph.
data <- mx.symbol.Variable("data")

# A fully connected layer
# data: the input source
# num_hidden: number of nodes in this layer (1 => a linear model)
fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)

# Define the loss for the regression task (linear-regression output).
lro <- mx.symbol.LinearRegressionOutput(fc1)

自定义神经网络 - 训练与预测

  • 针对回归任务修改了eval.metric参数
# Reproducible fit of the linear-regression symbol defined above.
# eval.metric is switched to RMSE because this is a regression task.
mx.set.seed(0)
model <- mx.model.FeedForward.create(
  lro,
  X = train.x,
  y = train.y,
  ctx = mx.cpu(),
  num.round = 6,
  array.batch.size = 20,
  learning.rate = 2e-6,
  momentum = 0.9,
  eval.metric = mx.metric.rmse
)
  • 其它的评价函数: accuracy, rmse, mae, rmsle

  • 用户也可以根据需要自定义评价函数,例如:

# Custom evaluation metric: mean absolute error between the true
# labels and the predictions.  In idiomatic R the last expression
# of a function is its return value, so the intermediate variable
# and explicit return() are unnecessary.
demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
  mean(abs(label - pred))
})
# Refit the same regression model, monitoring MAE instead of RMSE.
mx.set.seed(0)
model <- mx.model.FeedForward.create(
  lro,
  X = train.x,
  y = train.y,
  ctx = mx.cpu(),
  num.round = 6,
  array.batch.size = 20,
  learning.rate = 2e-6,
  momentum = 0.9,
  eval.metric = demo.metric.mae
)
  • 预测
predict(model, test.x)

手写数字竞赛 - 数据

  • 划分训练集和测试集
# Read the Kaggle digit-recognizer CSVs.  Each row is one image:
# column 1 of train.csv is the digit label, the remaining 784
# columns are pixel intensities in 0..255.
train <- read.csv("./demo/data/train.csv", header = TRUE)
test <- read.csv("./demo/data/test.csv", header = TRUE)
train <- data.matrix(train)
test <- data.matrix(test)

train.x <- train[, -1]
train.y <- train[, 1]

# Scale pixels into [0, 1] and transpose so that each column holds
# one image, which is the layout mxnet expects.
train.x <- t(train.x / 255)
test <- t(test / 255)
  • 数据格式转换
# Reshape the flat 784-pixel columns into the 4-D array layout
# (width, height, channel, sample) required by convolution layers.
train.array <- train.x
dim(train.array) <- c(28, 28, 1, ncol(train.x))
test.array <- test
dim(test.array) <- c(28, 28, 1, ncol(test))

手写数字竞赛 - 网络定义

# LeNet-style convolutional network for 28x28 digit images.
# input
data <- mx.symbol.Variable('data')
# first conv: 20 filters of size 5x5, tanh activation, 2x2 max-pooling
conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20)
tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max",
                          kernel=c(2,2), stride=c(2,2))
# second conv: 50 filters of size 5x5, tanh activation, 2x2 max-pooling
conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)
tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max",
                          kernel=c(2,2), stride=c(2,2))
# first fullc: flatten the feature maps, then 500 hidden units
flatten <- mx.symbol.Flatten(data=pool2)
fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500)
tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
# second fullc: 10 outputs, one per digit class
fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
# loss: softmax output over the 10 classes
lenet <- mx.symbol.SoftmaxOutput(data=fc2)

手写数字竞赛 - 模型训练

  • 使用CPU训练模型,设置迭代次数为1
# Train LeNet on the CPU for a single epoch and time the run.
device.cpu <- mx.cpu()
mx.set.seed(0)
tic <- proc.time()
model <- mx.model.FeedForward.create(
  lenet,
  X = train.array,
  y = train.y,
  ctx = device.cpu,
  num.round = 1,
  array.batch.size = 100,
  learning.rate = 0.05,
  momentum = 0.9,
  wd = 0.00001,
  eval.metric = mx.metric.accuracy,
  epoch.end.callback = mx.callback.log.train.metric(100)
)
# Start training with 1 devices
# [1] Train-accuracy=0.55708830548926
print(proc.time() - tic)
# user  system elapsed 
#  298.56   31.97  112.69

手写数字竞赛 - 预测

# Score the test images and write a Kaggle submission file with one
# row per image and its predicted digit.
preds <- predict(model, test.array)
# preds is 10 x n: pick the most probable class per column and map
# row index 1..10 back to digit 0..9.
pred.label <- max.col(t(preds)) - 1
# seq_len() instead of 1:ncol(test): 1:0 would yield c(1, 0) on an
# empty matrix, whereas seq_len(0) correctly yields integer(0).
submission <- data.frame(ImageId = seq_len(ncol(test)), Label = pred.label)
write.csv(submission, file = 'submission.csv', row.names = FALSE, quote = FALSE)

XGBoost

  • 数据准备
  • 模型训练与预测
  • 交叉验证

数据准备

# install.packages("xgboost")
library("xgboost")
# Load the bundled mushroom (agaricus) data: a sparse binary
# classification problem shipped with the xgboost package.
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
train <- agaricus.train
test <- agaricus.test
# The feature matrix is stored as a sparse dgCMatrix (Matrix pkg).
class(train$data)
[1] "dgCMatrix"
attr(,"package")
[1] "Matrix"

模型训练与预测

# Fit a 2-round gradient-boosted tree model (depth-2 trees,
# learning rate eta = 1) for binary classification.
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
[1] train-error:0.046522 
[2] train-error:0.022263 
# With "binary:logistic", predict() returns the predicted
# probability of the positive class for each test row.
pred <- predict(bst, test$data)
pred[1:5]
[1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873

交叉验证

# 5-fold cross-validation with the same boosting parameters;
# reports mean +/- sd of train/test error for each round.
cv.res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic", nfold = 5)
[1] train-error:0.046522+0.001224   test-error:0.046522+0.004893 
[2] train-error:0.022263+0.001073   test-error:0.022264+0.004292 
cv.res
##### xgb.cv 5-folds
 iter train_error_mean train_error_std test_error_mean test_error_std
    1        0.0465222     0.001223597       0.0465224    0.004893428
    2        0.0222632     0.001073064       0.0222642    0.004291863

参考资料

Thank you