一言不合就学R

商务大数据智能分析R3

Lecturer : 申 旌 周
jingzhou_shen@stu.xjtu.edu.cn

Instructor : 常 象 宇
xiangyuchang@xjtu.edu.cn

2017年3月19日

概览

  • 函数

  • 作图

函数

  • 定义
  • 参数
  • 封装

函数 - 定义

rsummary <- function(x) {
  return(
    list(
    mean = mean(x), 
    sd = sd(x),
    quantiles = quantile(x)))
}
x <- rnorm(100, mean = 6, sd = 1)
rsummary(x)
$mean
[1] 5.895624

$sd
[1] 1.016297

$quantiles
      0%      25%      50%      75%     100% 
3.518802 5.106896 5.869846 6.557136 8.789925 

函数 - 定义

calGrade <- function(x, base = 59) {
  s <- x + base
  if(s >= 90) {
    grade <- "优"
  } else if(s >= 80) {
    grade <- "良"
  } else if(s >= 70) {
    grade <- "中"
  } else if(x >= 60){
    grade <- "及格"
  }else{
    grade <- "仍需努力"
  }
  grade
}
scores <- c(40, 20, 27, 31, 0, 0)

函数 - 定义

calGrade(scores) # 注意与下面语句的区别
[1] "优"
lapply(scores, calGrade) # 复习lapply
[[1]]
[1] "优"

[[2]]
[1] "中"

[[3]]
[1] "良"

[[4]]
[1] "优"

[[5]]
[1] "仍需努力"

[[6]]
[1] "仍需努力"

函数 - 参数

  • formals
formals(calGrade)
$x


$base
[1] 59
formals(plot)
$x


$y


$...
  • 调用函数时,参数可以缺失,可能有缺省值

函数 - 参数

  • 参数匹配
(M <- matrix(1:6, nrow = 2)) # 精确匹配
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
(M <- matrix(1:6, nr = 2)) # 部分匹配
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
(M <- matrix(1:6, 2)) # 位置匹配
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6

函数 - 参数

  • ...

quantile(x, probs = seq(0, 1, 0.25), ...)
apply(X, MARGIN, FUN, ...)

set.seed(1)
(x <- matrix(rnorm(12), 3)) 
           [,1]       [,2]      [,3]       [,4]
[1,] -0.6264538  1.5952808 0.4874291 -0.3053884
[2,]  0.1836433  0.3295078 0.7383247  1.5117812
[3,] -0.8356286 -0.8204684 0.5757814  0.3898432
apply(x, 2, quantile)  # matrix
           [,1]       [,2]      [,3]        [,4]
0%   -0.8356286 -0.8204684 0.4874291 -0.30538839
25%  -0.7310412 -0.2454803 0.5316052  0.04222742
50%  -0.6264538  0.3295078 0.5757814  0.38984324
75%  -0.2214052  0.9623943 0.6570530  0.95081220
100%  0.1836433  1.5952808 0.7383247  1.51178117

函数 - 参数

  • ...

quantile(x, probs = seq(0, 1, 0.25), ...)
apply(X, MARGIN, FUN, ...)

set.seed(1)
(x <- matrix(rnorm(12), 3))
           [,1]       [,2]      [,3]       [,4]
[1,] -0.6264538  1.5952808 0.4874291 -0.3053884
[2,]  0.1836433  0.3295078 0.7383247  1.5117812
[3,] -0.8356286 -0.8204684 0.5757814  0.3898432
apply(x, 2, quantile, probs = c(.25, .75))
          [,1]       [,2]      [,3]       [,4]
25% -0.7310412 -0.2454803 0.5316052 0.04222742
75% -0.2214052  0.9623943 0.6570530 0.95081220

函数 - 参数

  • ...
str(paste)
function (..., sep = " ", collapse = NULL)  
  • GOOD
paste("Hi", " There", sep = "!")
[1] "Hi! There"
  • BAD
paste("Hi", " There", se = "!")
[1] "Hi  There !"

函数 - 参数

  • ...
x <- 1:30
plot(x, log(x))

plot of chunk unnamed-chunk-17

函数 - 参数

  • ...
my.plot <- 
  function(x, y, type = "l", ...){
     plot(x, y, type = type, ...)
}  # 自定义函数
my.plot(x, log(x))

plot of chunk unnamed-chunk-19

函数 - 封装

  • Obsessive Compulsive Disorder, OCD

函数 - 封装

  • 主文件调用函数文件
source("./MyPlotEncap.R")  
my.plot.encap(x, log(x))

plot of chunk unnamed-chunk-20

概览

- 函数

- 作图

作图

  • plot
  • ggplot2
  • 其它

plot

  • 数据
  • 单变量作图
  • 多变量作图
  • 存储

plot - 数据

library(MASS)
str(birthwt)
'data.frame':   189 obs. of  10 variables:
 $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
 $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
 $ race : int  2 3 1 1 1 3 1 3 1 1 ...
 $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
 $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
 $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
 $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

plot - 数据

head(birthwt, 4)
   low age lwt race smoke ptl ht ui ftv  bwt
85   0  19 182    2     0   0  0  1   0 2523
86   0  33 155    3     0   0  0  0   3 2551
87   0  20 105    1     1   0  0  0   1 2557
88   0  21 108    1     1   0  0  1   2 2594
colnames(birthwt) <- 
  c("birthwt.below.2500", 
    "mother.age", "mother.weight",  
    "race", "mother.smokes", "previous.prem.labor",
    "hypertension", "uterine.irr", "physician.visits", 
    "birthwt.grams")
birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)
levels(birthwt$mother.smokes)
[1] "0" "1"

plot - 单变量

plot(birthwt$mother.age)

plot of chunk unnamed-chunk-26

plot - 单变量

hist(birthwt$mother.age)

plot of chunk unnamed-chunk-27

plot - 单变量

with(birthwt, plot (mother.age, birthwt.grams, 
xlab = "孕期妈妈年龄", 
ylab = "新生儿体重(克)",
col = mother.smokes,
pch = 19, # 实心点
cex = 0.7)) # 点的大小
abline(h = 2500)
legend("bottomright", c("不抽烟","抽烟"), col=c(1,2), pch=19)

plot of chunk unnamed-chunk-28

plot - 单变量

plot(birthwt$mother.smokes)

plot of chunk unnamed-chunk-29

  • plot(x, ...) 的作图行为因 class(x) 而变

plot - 单变量

plot(birthwt$mother.smokes, 
     main = "孕期妈妈抽烟分布图", 
     xlab = "孕期妈妈是否抽烟", 
     ylab = "数量",
     col = "lightblue")

plot of chunk unnamed-chunk-30

plot - 多变量

plot - 多变量

par(mfrow = c(1, 1))
with(birthwt, 
     plot(mother.smokes, 
          birthwt.grams, 
     xlab = "孕期妈妈是否抽烟", 
     ylab = "新生儿体重(克)"))

plot of chunk unnamed-chunk-31

plot - 多变量

with(birthwt, 
     plot(physician.visits, 
          birthwt.grams,
     xlab = "就诊数量", 
     ylab = "新生儿体重(克)",
     col = 'lightblue'))

plot of chunk unnamed-chunk-32

plot - 多变量

with(birthwt,      plot(as.factor(physician.visits),     birthwt.grams,
     xlab = "就诊数量", 
     ylab = "新生儿体重(克)",
     col = 'lightblue'))

plot of chunk unnamed-chunk-33

plot - 存储

  • png
png(file= "scatter.png", bg = "white", res = 120)
  with(birthwt,      
       plot(as.factor(physician.visits),     
            birthwt.grams,
            xlab = "就诊数量", 
            ylab = "新生儿体重(克)",
            col = 'lightblue'))
dev.off()
  • pdf
  • jpeg
  • 工作目录
    setwd()
    getwd()

作图

  • plot
  • ggplot2
  • 其它

ggplot2

  • PK
  • 概念
  • 数据
  • 柱状图
  • 散点图
  • 折线图
  • 标签
  • 存储

ggplot2 v.s. plot

  • plot
with(birthwt, 
plot(mother.age, 
     birthwt.grams))

plot of chunk unnamed-chunk-35

ggplot2 v.s. plot

  • qplot (quick plot)
library(ggplot2)
qplot(x = mother.age, y = birthwt.grams, data = birthwt)

plot of chunk unnamed-chunk-36

ggplot2 v.s. plot

  • qplot
birthwt$race <- as.factor(birthwt$race)
qplot(x = mother.age, y = birthwt.grams, data = birthwt, color = race, shape = race, # 1 = white, 2 = black, 3 = other
xlab="孕期妈妈年龄",ylab="新生儿体重(克)") 

plot of chunk unnamed-chunk-38

ggplot2 - Grammar of Graphics

  • 核心: 面向图形[对象]
  • 数据(data)
  • 图形映射(mapping)
  • 几何对象(geom)
  • 统计变换(stats)
  • 分面(facet)
  • 标尺(scale)
  • 坐标系(coord)
  • 主题(theme)
  • 存储(save)

ggplot2 - Grammar of Graphics

  • 标准语法
p <- ggplot(
  data, 
  mapping = aes(x, y, ...) # 图形对象
p <- p + layer( # 图形对象加图层
  geom = "...",
  geom_params = list(...),
  stat = "...",
  stat_params = list(...))
p
  • 简化语法
ggplot(
   data,
   aes(x, y, <other aesthetics>)) +
 geom_XXX() +
 stat_XXX()

ggplot2 - 简化语法

  • 简化语法
ggplot(
   data,
   aes(x, y, 
       <other aesthetics>)) +
 geom_XXX() +
 stat_XXX()

ggplot2 - 简化语法

  • 简化语法

ggplot2 - 数据: mpg

str(mpg)
Classes 'tbl_df', 'tbl' and 'data.frame':   234 obs. of  11 variables:
 $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
 $ model       : chr  "a4" "a4" "a4" "a4" ...
 $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
 $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
 $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
 $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
 $ drv         : chr  "f" "f" "f" "f" ...
 $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
 $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
 $ fl          : chr  "p" "p" "p" "p" ...
 $ class       : chr  "compact" "compact" "compact" "compact" ...

ggplot2 - 柱状图

ggplot(mpg, 
      aes(x = drv)) + 
  geom_bar()

plot of chunk unnamed-chunk-43

ggplot2 - 柱状图

ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar()

plot of chunk unnamed-chunk-44

ggplot2 - 柱状图

ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar()

plot of chunk unnamed-chunk-45

ggplot2 - 柱状图

ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar() + 
  coord_flip()

plot of chunk unnamed-chunk-46

ggplot2 - 柱状图

ggplot(mpg, 
       aes(x = drv, 
           fill = as.factor(cyl))) + 
  geom_bar(position='dodge')

plot of chunk unnamed-chunk-47

ggplot2 - 柱状图

library(plyr)
mpg.avg <- ddply(
 mpg, 
 c('drv', 'cyl'), 
 summarize, 
 avg.hwy = mean(hwy))
mpg.avg
  drv cyl  avg.hwy
1   4   4 24.60870
2   4   6 19.50000
3   4   8 16.35417
4   f   4 30.46552
5   f   5 28.75000
6   f   6 25.06977
7   f   8 25.00000
8   r   6 25.25000
9   r   8 20.19048

ggplot2 - 柱状图

  • 默认: stat = 'count'
ggplot(mpg,
aes(x = drv, fill = as.factor(cyl))) + geom_bar(position='dodge', 
stat = 'count')

plot of chunk unnamed-chunk-50

ggplot2 - 柱状图

  • 感兴趣的统计量:stat='identity'
ggplot(mpg.avg, 
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity')

plot of chunk unnamed-chunk-51

ggplot2 - 散点图

  • argument: user dependent
ggplot(mpg, 
aes(x=displ, y=hwy)) + 
geom_point(color='blue', 
           size=2.5)

plot of chunk unnamed-chunk-52

ggplot2 - 散点图

  • mapping: data dependent
ggplot(mpg, 
aes(x=displ, y=hwy, color=class)) + 

geom_point(size=2.5)

plot of chunk unnamed-chunk-53

ggplot2 - 分面:以散点图为例

ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() + facet_wrap("class")

plot of chunk unnamed-chunk-54

ggplot2 - 多数据集:以散点图为例

ggplot(
  mpg, 
  aes(x=displ, y=hwy)) + geom_point() + 
facet_grid(drv ~ cyl)

plot of chunk unnamed-chunk-55

ggplot2 - 多数据集:以散点图为例

mpg.no.facet <- subset(mpg, 
  select = c('displ', 'hwy'))
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() +  
facet_grid(drv ~ cyl) + geom_point(data = mpg.no.facet, color = 'grey', size = 1)

plot of chunk unnamed-chunk-57

ggplot2 - 折线图

library(plyr)
mpg.avg <- ddply(
 mpg, 
 c('drv', 'cyl'), 
 summarize, 
 avg.hwy = mean(hwy))
mpg.avg
  drv cyl  avg.hwy
1   4   4 24.60870
2   4   6 19.50000
3   4   8 16.35417
4   f   4 30.46552
5   f   5 28.75000
6   f   6 25.06977
7   f   8 25.00000
8   r   6 25.25000
9   r   8 20.19048

ggplot2 - 折线图

ggplot(
  mpg.avg,
  aes(x=drv, y=avg.hwy,
  color=as.factor(cyl), group = cyl)) + 
geom_line() + 
geom_point(size=4)

plot of chunk unnamed-chunk-60

ggplot2 - 折线图

ggplot(mpg.avg, 
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity') # 对比用

plot of chunk unnamed-chunk-61

ggplot2 - 标签

mpg.means <- ddply(mpg, 'class', summarize, avg.cty = mean(cty))
mpg.means
       class  avg.cty
1    2seater 15.40000
2    compact 20.12766
3    midsize 18.75610
4    minivan 15.81818
5     pickup 13.00000
6 subcompact 20.37143
7        suv 13.50000

ggplot2 - 标签

mpg.plot <- ggplot(mpg.means, aes(x=class, y = avg.cty)) + geom_bar(stat='identity')
mpg.plot

plot of chunk unnamed-chunk-65

ggplot2 - 标签

mpg.plot <- mpg.plot + 
  labs(
    x = '', y = '英里每加仑',
    title='不同车型的城市里程') 
mpg.plot

plot of chunk unnamed-chunk-67

ggplot2 - 标签

mpg.plot + theme(
  text = element_text(size=22), 
  axis.text.x = element_text(
                 angle=45, 
                 vjust=1, 
                 hjust=1))

plot of chunk unnamed-chunk-68

ggplot2 - 存储

  • 存图像
ggsave("./fig/plot.png", width = 8, height = 6)
ggsave("./fig/plot.pdf", width = 8, height = 6)
   格式         
1  "eps"        
2  "ps"         
3  "tex(pictex)"
4  "pdf"        
5  "jpeg"       
6  "tiff"       
7  "png"        
8  "bmp"        
9  "svg"        
10 "wmf"        
  • 存图像数据
save(mpg.plot, file = "plot.rdata") 

作图 - 其它

Thank you