商务大数据智能分析 之 R3
Lecturer   : 申 旌 周
jingzhou_shen@stu.xjtu.edu.cn  
Instructor : 常 象 宇
xiangyuchang@xjtu.edu.cn
2017年3月19日
函数
作图
rsummary <- function(x) {
  return(
    list(
    mean = mean(x), 
    sd = sd(x),
    quantiles = quantile(x)))
}
x <- rnorm(100, mean = 6, sd = 1)
rsummary(x)
$mean
[1] 5.895624
$sd
[1] 1.016297
$quantiles
      0%      25%      50%      75%     100% 
3.518802 5.106896 5.869846 6.557136 8.789925 
calGrade <- function(x, base = 59) {
  s <- x + base
  if(s >= 90) {
    grade <- "优"
  } else if(s >= 80) {
    grade <- "良"
  } else if(s >= 70) {
    grade <- "中"
  } else if(x >= 60){
    grade <- "及格"
  }else{
    grade <- "仍需努力"
  }
  grade
}
scores <- c(40, 20, 27, 31, 0, 0)
calGrade(scores) # 注意与下面语句的区别
[1] "优"
lapply(scores, calGrade) # 复习lapply
[[1]]
[1] "优"
[[2]]
[1] "中"
[[3]]
[1] "良"
[[4]]
[1] "优"
[[5]]
[1] "仍需努力"
[[6]]
[1] "仍需努力"
formalsformals(calGrade)
$x
$base
[1] 59
formals(plot)
$x
$y
$...
(M <- matrix(1:6, nrow = 2)) # 精确匹配
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
(M <- matrix(1:6, nr = 2)) # 部分匹配
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
(M <- matrix(1:6, 2)) # 位置匹配
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
...
quantile(x, probs = seq(0, 1, 0.25), ...)
apply(X, MARGIN, FUN, ...)
set.seed(1)
(x <- matrix(rnorm(12), 3)) 
           [,1]       [,2]      [,3]       [,4]
[1,] -0.6264538  1.5952808 0.4874291 -0.3053884
[2,]  0.1836433  0.3295078 0.7383247  1.5117812
[3,] -0.8356286 -0.8204684 0.5757814  0.3898432
apply(x, 2, quantile)  # matrix
           [,1]       [,2]      [,3]        [,4]
0%   -0.8356286 -0.8204684 0.4874291 -0.30538839
25%  -0.7310412 -0.2454803 0.5316052  0.04222742
50%  -0.6264538  0.3295078 0.5757814  0.38984324
75%  -0.2214052  0.9623943 0.6570530  0.95081220
100%  0.1836433  1.5952808 0.7383247  1.51178117
...
quantile(x, probs = seq(0, 1, 0.25), ...)
apply(X, MARGIN, FUN, ...)
set.seed(1)
(x <- matrix(rnorm(12), 3))
           [,1]       [,2]      [,3]       [,4]
[1,] -0.6264538  1.5952808 0.4874291 -0.3053884
[2,]  0.1836433  0.3295078 0.7383247  1.5117812
[3,] -0.8356286 -0.8204684 0.5757814  0.3898432
apply(x, 2, quantile, probs = c(.25, .75))
          [,1]       [,2]      [,3]       [,4]
25% -0.7310412 -0.2454803 0.5316052 0.04222742
75% -0.2214052  0.9623943 0.6570530 0.95081220
...str(paste)
function (..., sep = " ", collapse = NULL)  
paste("Hi", " There", sep = "!")
[1] "Hi! There"
paste("Hi", " There", se = "!")
[1] "Hi  There !"
...x <- 1:30
plot(x, log(x))
...my.plot <- 
  function(x, y, type = "l", ...){
     plot(x, y, type = type, ...)
}  # 自定义函数
my.plot(x, log(x))
source("./MyPlotEncap.R")  
my.plot.encap(x, log(x))
library(MASS)
str(birthwt)
'data.frame':   189 obs. of  10 variables:
 $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
 $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
 $ race : int  2 3 1 1 1 3 1 3 1 1 ...
 $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
 $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
 $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
 $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
head(birthwt, 4)
   low age lwt race smoke ptl ht ui ftv  bwt
85   0  19 182    2     0   0  0  1   0 2523
86   0  33 155    3     0   0  0  0   3 2551
87   0  20 105    1     1   0  0  0   1 2557
88   0  21 108    1     1   0  0  1   2 2594
colnames(birthwt) <- 
  c("birthwt.below.2500", 
    "mother.age", "mother.weight",  
    "race", "mother.smokes", "previous.prem.labor",
    "hypertension", "uterine.irr", "physician.visits", 
    "birthwt.grams")
birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)
levels(birthwt$mother.smokes)
[1] "0" "1"
plot(birthwt$mother.age)
hist(birthwt$mother.age)
with(birthwt, plot (mother.age, birthwt.grams, 
xlab = "孕期妈妈年龄", 
ylab = "新生儿体重(克)",
col = mother.smokes,
pch = 19, # 实心点
cex = 0.7)) # 点的大小
abline(h = 2500)
legend("bottomright", c("不抽烟","抽烟"), col=c(1,2), pch=19)
plot(birthwt$mother.smokes)
plot(x, ...) 的作图行为因 class(x) 而变plot(birthwt$mother.smokes, 
     main = "孕期妈妈抽烟分布图", 
     xlab = "孕期妈妈是否抽烟", 
     ylab = "数量",
     col = "lightblue")
par(mfrow = c(1, 1))
with(birthwt, 
     plot(mother.smokes, 
          birthwt.grams, 
     xlab = "孕期妈妈是否抽烟", 
     ylab = "新生儿体重(克)"))
with(birthwt, 
     plot(physician.visits, 
          birthwt.grams,
     xlab = "就诊数量", 
     ylab = "新生儿体重(克)",
     col = 'lightblue'))
with(birthwt,      plot(as.factor(physician.visits),     birthwt.grams,
     xlab = "就诊数量", 
     ylab = "新生儿体重(克)",
     col = 'lightblue'))
png(file= "scatter.png", bg = "white", res = 120)
  with(birthwt,      
       plot(as.factor(physician.visits),     
            birthwt.grams,
            xlab = "就诊数量", 
            ylab = "新生儿体重(克)",
            col = 'lightblue'))
dev.off()
setwd()getwd()with(birthwt, 
plot(mother.age, 
     birthwt.grams))
library(ggplot2)
qplot(x = mother.age, y = birthwt.grams, data = birthwt)
birthwt$race <- as.factor(birthwt$race)
qplot(x = mother.age, y = birthwt.grams, data = birthwt, color = race, shape = race, # 1 = white, 2 = black, 3 = other
xlab="孕期妈妈年龄",ylab="新生儿体重(克)") 
p <- ggplot(
  data, 
  mapping = aes(x, y, ...) # 图形对象
p <- p + layer( # 图形对象加图层
  geom = "...",
  geom_params = list(...),
  stat = "...",
  stat_params = list(...))
p
ggplot(
   data,
   aes(x, y, <other aesthetics>)) +
 geom_XXX() +
 stat_XXX()
ggplot(
   data,
   aes(x, y, 
       <other aesthetics>)) +
 geom_XXX() +
 stat_XXX()
str(mpg)
Classes 'tbl_df', 'tbl' and 'data.frame':   234 obs. of  11 variables:
 $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
 $ model       : chr  "a4" "a4" "a4" "a4" ...
 $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
 $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
 $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
 $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
 $ drv         : chr  "f" "f" "f" "f" ...
 $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
 $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
 $ fl          : chr  "p" "p" "p" "p" ...
 $ class       : chr  "compact" "compact" "compact" "compact" ...
ggplot(mpg, 
      aes(x = drv)) + 
  geom_bar()
ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar()
ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar()
ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar() + 
  coord_flip()
ggplot(mpg, 
       aes(x = drv, 
           fill = as.factor(cyl))) + 
  geom_bar(position='dodge')
library(plyr)
mpg.avg <- ddply(
 mpg, 
 c('drv', 'cyl'), 
 summarize, 
 avg.hwy = mean(hwy))
mpg.avg
  drv cyl  avg.hwy
1   4   4 24.60870
2   4   6 19.50000
3   4   8 16.35417
4   f   4 30.46552
5   f   5 28.75000
6   f   6 25.06977
7   f   8 25.00000
8   r   6 25.25000
9   r   8 20.19048
ggplot(mpg,
aes(x = drv, fill = as.factor(cyl))) + geom_bar(position='dodge', 
stat = 'count')
ggplot(mpg.avg, 
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity')
ggplot(mpg, 
aes(x=displ, y=hwy)) + 
geom_point(color='blue', 
           size=2.5)
ggplot(mpg, 
aes(x=displ, y=hwy, color=class)) + 
geom_point(size=2.5)
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() + facet_wrap("class")
ggplot(
  mpg, 
  aes(x=displ, y=hwy)) + geom_point() + 
facet_grid(drv ~ cyl)
mpg.no.facet <- subset(mpg, 
  select = c('displ', 'hwy'))
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() +  
facet_grid(drv ~ cyl) + geom_point(data = mpg.no.facet, color = 'grey', size = 1)
library(plyr)
mpg.avg <- ddply(
 mpg, 
 c('drv', 'cyl'), 
 summarize, 
 avg.hwy = mean(hwy))
mpg.avg
  drv cyl  avg.hwy
1   4   4 24.60870
2   4   6 19.50000
3   4   8 16.35417
4   f   4 30.46552
5   f   5 28.75000
6   f   6 25.06977
7   f   8 25.00000
8   r   6 25.25000
9   r   8 20.19048
ggplot(
  mpg.avg,
  aes(x=drv, y=avg.hwy,
  color=as.factor(cyl), group = cyl)) + 
geom_line() + 
geom_point(size=4)
ggplot(mpg.avg, 
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity') # 对比用
mpg.means <- ddply(mpg, 'class', summarize, avg.cty = mean(cty))
mpg.means
       class  avg.cty
1    2seater 15.40000
2    compact 20.12766
3    midsize 18.75610
4    minivan 15.81818
5     pickup 13.00000
6 subcompact 20.37143
7        suv 13.50000
mpg.plot <- ggplot(mpg.means, aes(x=class, y = avg.cty)) + geom_bar(stat='identity')
mpg.plot
mpg.plot <- mpg.plot + 
  labs(
    x = '', y = '英里每加仑',
    title='不同车型的城市里程') 
mpg.plot
mpg.plot + theme(
  text = element_text(size=22), 
  axis.text.x = element_text(
                 angle=45, 
                 vjust=1, 
                 hjust=1))
ggsave("./fig/plot.png", width = 8, height = 6)
ggsave("./fig/plot.pdf", width = 8, height = 6)
   格式         
1  "eps"        
2  "ps"         
3  "tex(pictex)"
4  "pdf"        
5  "jpeg"       
6  "tiff"       
7  "png"        
8  "bmp"        
9  "svg"        
10 "wmf"        
save(mpg.plot, file = "plot.rdata") 
Thank you