1 R的探索性数据分析与可视化

  • 基本作图:plot

  • 高级作图:ggplot2

  • 交互式作图:plotly

2 实例分析 - Air Pollution

2.1 Read Data

#Read U.S. air pollution data (2008-2010)
pollution <- read.csv("avgpm25.csv", colClass = c("numeric","character","factor","numeric","numeric"))

head(pollution)
##        pm25 fips region longitude latitude
## 1  9.771185 1003   east -87.74826 30.59278
## 2  9.993817 1027   east -85.84286 33.26581
## 3 10.688618 1033   east -87.72596 34.73148
## 4 11.337424 1049   east -85.79892 34.45913
## 5 12.119764 1055   east -86.03212 34.01860
## 6 10.827805 1069   east -85.35039 31.18973
#Summary the data
summary(pollution)
##       pm25            fips            region      longitude      
##  Min.   : 3.383   Length:576         east:442   Min.   :-158.04  
##  1st Qu.: 8.549   Class :character   west:134   1st Qu.: -97.38  
##  Median :10.047   Mode  :character              Median : -87.37  
##  Mean   : 9.836                                 Mean   : -91.65  
##  3rd Qu.:11.356                                 3rd Qu.: -80.72  
##  Max.   :18.441                                 Max.   : -68.26  
##     latitude    
##  Min.   :19.68  
##  1st Qu.:35.30  
##  Median :39.09  
##  Mean   :38.56  
##  3rd Qu.:41.75  
##  Max.   :64.82
str(pollution)
## 'data.frame':    576 obs. of  5 variables:
##  $ pm25     : num  9.77 9.99 10.69 11.34 12.12 ...
##  $ fips     : chr  "1003" "1027" "1033" "1049" ...
##  $ region   : Factor w/ 2 levels "east","west": 1 1 1 1 1 1 1 1 1 1 ...
##  $ longitude: num  -87.7 -85.8 -87.7 -85.8 -86 ...
##  $ latitude : num  30.6 33.3 34.7 34.5 34 ...

2.2 BoxPlot

summary(pollution$pm25)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.383   8.549  10.047   9.836  11.356  18.441
boxplot(pollution$pm25,col = "blue")
abline(h = 12)

boxplot(pm25~region, data = pollution, col = "red")

2.3 Histogram

hist(pollution$pm25, col = "green")
rug(pollution$pm25)

hist(pollution$pm25, col = "green", breaks = 100)
rug(pollution$pm25)
abline(v = 12, lwd = 2)

par(mfrow = c(2,1), mar = c(4,4,2,1))
hist(subset(pollution, region == "east")$pm25, col = "green")
abline(v = 12, lwd = 2)
hist(subset(pollution, region == "west")$pm25, col = "red")
abline(v = 12, lwd = 2)

2.4 BarPlot

barplot(table(pollution$region), col = "wheat", main = "Number of Counties in Each Region")

2.5 ScatterPlot

with(pollution, plot(latitude, pm25, col = region))
abline(h = 12, lwd = 2, lty = 2)

2.6 Maps

library("maps")
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
library("mapdata")
map("county")
colpm25 <- ifelse(pollution$pm25 > 12, 2, 4)
points(x = pollution$longitude,y = pollution$latitude, col = colpm25,cex = 1, type = "p", pch = 16)

3 基本作图

  • 图形
  • 横纵轴文字
  • 标题
  • 横纵轴的长度
  • 颜色
  • 形状大小
  • 图例

3.1 Data

library(MASS)
head(birthwt, 4)
##    low age lwt race smoke ptl ht ui ftv  bwt
## 85   0  19 182    2     0   0  0  1   0 2523
## 86   0  33 155    3     0   0  0  0   3 2551
## 87   0  20 105    1     1   0  0  0   1 2557
## 88   0  21 108    1     1   0  0  1   2 2594
colnames(birthwt) <- 
  c("birthwt.below.2500", 
    "mother.age", "mother.weight",  
    "race", "mother.smokes", "previous.prem.labor",
    "hypertension", "uterine.irr", "physician.visits", 
    "birthwt.grams")
birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)
levels(birthwt$mother.smokes)
## [1] "0" "1"

3.2 plot - one

plot(birthwt$mother.age)

hist(birthwt$mother.age)

plot(birthwt$mother.age)
grid()

  • ?par
with(birthwt, plot (mother.age, birthwt.grams, 
xlab = "mother.age", 
ylab = "birthwt.grams",
col = mother.smokes,
type = 'p', # 点
pch = 19, # 实心点
cex = 0.7)) # 点的大小
abline(h = 2500)
legend("bottomright", c("Non-Smoking","Smoking"), col=c(1,2), pch=19)

type命令 描述
“p” points
“l” lines
“b” both points and lines
“c” empty points joined by lines
“o” overplotted points and lines
“s” and “S” stair steps
“h” histogram-like vertical lines
“n” does not produce any points or lines
  • pch

图例:legend(x, y, labels, pch = c(), lty = c(), lwd = c(), col = c())

  • lty, lwd都是控制线的参数

  • xlim = c(lower, upper) and ylim = c(lower, upper)

更多内容参考Quick-R.

plot(birthwt$mother.smokes)

  • 尝试barplot
  • plot(x, ...) 的作图行为因 class(x) 而变
plot(birthwt$mother.smokes, 
     main = "Smoking Distribution", 
     xlab = "mother.smoke", 
     ylab = "count",
     col = "lightblue")

3.3 plot - two

with(birthwt, 
     plot(mother.smokes, 
          birthwt.grams, 
     xlab = "mother.smokes", 
     ylab = "birthwt.grams"))

with(birthwt, 
     plot(physician.visits, 
          birthwt.grams,
     xlab = "physician.visits", 
     ylab = "birthwt.grams",
     col = 'lightblue'))

with(birthwt,      plot(as.factor(physician.visits),     birthwt.grams,
     xlab = "physician.visits", 
     ylab = "birthwt.grams",
     col = 'lightblue'))

尝试boxplot

  • png
png(file= "myplot.png", bg = "white", res = 120)
  with(birthwt,      
       plot(as.factor(physician.visits),     
            birthwt.grams,
            xlab = "physician.visits", 
            ylab = "birthwt.grams",
            col = 'lightblue'))
dev.off()
  • pdf
  • jpeg
  • eps
  • 工作目录
    setwd()
    getwd()

3.4 plot - more

data("iris")
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
plot(iris[, 1:4])

  • try pairs()
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
corrplot(cor(iris[,1:4]))

4 高级作图

4.1 ggplot2 v.s. plot

  • plot
with(birthwt, 
plot(mother.age, 
     birthwt.grams))

  • plot
with(birthwt, 
plot(mother.age, 
     birthwt.grams))

  • qplot (quick plot)
library(ggplot2)
qplot(x = mother.age, y = birthwt.grams, data = birthwt)

  • plot
with(birthwt, 
plot(mother.age, 
     birthwt.grams))

  • qplot
birthwt$race <- as.factor(birthwt$race)
qplot(x = mother.age, y = birthwt.grams, data = birthwt, color = race, shape = race, # 1 = white, 2 = black, 3 = other
xlab="mother.age",ylab="birthwt.grams") 

4.2 ggplot2

  • 核心: 面向图形[对象]
  • 数据(data)
  • 图形映射(mapping)
  • 几何对象(geom)
  • 统计变换(stats)
  • 分面(facet)
  • 标尺(scale)
  • 坐标系(coord)
  • 主题(theme)
  • 存储(save)

4.2.1 标准语法

p <- ggplot(
  data, 
  mapping = aes(x, y, ...) # 图形对象
p <- p + layer( # 图形对象加图层
  geom = "...",
  geom_params = list(...),
  stat = "...",
  stat_params = list(...))
p

4.2.2 简化语法

ggplot(
   data,
   aes(x, y, <other aesthetics>)) +
 geom_XXX() +
 stat_XXX()

4.3 ggplot2 - hist

str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
ggplot(mpg, 
      aes(x = drv)) + 
  geom_bar()

ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar()

ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar()

ggplot(mpg, 
      aes(x = drv, 
          fill = as.factor(cyl))) + 
  geom_bar() + 
  coord_flip()

ggplot(mpg, 
       aes(x = drv, 
           fill = as.factor(cyl))) + 
  geom_bar(position='dodge')

library(plyr)
mpg.avg <- ddply(
 mpg, 
 c('drv', 'cyl'), 
 summarize, 
 avg.hwy = mean(hwy))
mpg.avg
##   drv cyl  avg.hwy
## 1   4   4 24.60870
## 2   4   6 19.50000
## 3   4   8 16.35417
## 4   f   4 30.46552
## 5   f   5 28.75000
## 6   f   6 25.06977
## 7   f   8 25.00000
## 8   r   6 25.25000
## 9   r   8 20.19048
  • 默认: stat = ‘count’
ggplot(mpg,
aes(x = drv, fill = as.factor(cyl))) + geom_bar(position='dodge', 
stat = 'count')

  • 感兴趣的统计量:stat=‘identity’
ggplot(mpg.avg, 
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity')

4.4 ggplot2 - scatter

  • argument: user dependent
ggplot(mpg, 
aes(x=displ, y=hwy)) + 
geom_point(color='blue', 
           size=2.5)

  • mapping: data dependent
ggplot(mpg, 
aes(x=displ, y=hwy, color=class)) + 
  
geom_point(size=2.5)

4.5 ggplot2 - facet

ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() + facet_wrap("class")

ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() + 
facet_grid(drv ~ cyl)

  • 多数据源
ggplot(
  mpg, 
  aes(x=displ, y=hwy)) + geom_point() + 
facet_grid(drv ~ cyl)

mpg.no.facet <- subset(mpg, 
  select = c('displ', 'hwy'))
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() +  
facet_grid(drv ~ cyl) + geom_point(data = mpg.no.facet, color = 'grey', size = 1)

4.6 ggplot2 - line

library(plyr)
mpg.avg <- ddply(
 mpg, 
 c('drv', 'cyl'), 
 summarize, 
 avg.hwy = mean(hwy))
mpg.avg
##   drv cyl  avg.hwy
## 1   4   4 24.60870
## 2   4   6 19.50000
## 3   4   8 16.35417
## 4   f   4 30.46552
## 5   f   5 28.75000
## 6   f   6 25.06977
## 7   f   8 25.00000
## 8   r   6 25.25000
## 9   r   8 20.19048
ggplot(
  mpg.avg,
  aes(x=drv, y=avg.hwy,
  color=as.factor(cyl), group = cyl)) + 
geom_line() + 
geom_point(size=4)

ggplot(mpg.avg, 
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity')

ggplot(
  mpg.avg,
  aes(x=drv, y=avg.hwy,
  color=as.factor(cyl), group = cyl)) + 
geom_line() + 
geom_point(size=4)

4.7 ggplot2 - label

mpg.means <- ddply(mpg, 'class', summarize, avg.cty = mean(cty))
mpg.means
##        class  avg.cty
## 1    2seater 15.40000
## 2    compact 20.12766
## 3    midsize 18.75610
## 4    minivan 15.81818
## 5     pickup 13.00000
## 6 subcompact 20.37143
## 7        suv 13.50000
mpg.plot <- ggplot(mpg.means, aes(x=class, y = avg.cty)) + geom_bar(stat='identity')
mpg.plot

mpg.plot <- mpg.plot + 
  labs(
    x = '', y = '英里每加仑',
    title='不同车型的城市里程') 
mpg.plot

mpg.plot + theme(
  text = element_text(size=22), 
  axis.text.x = element_text(
                 angle=45, 
                 vjust=1, 
                 hjust=1))

4.8 ggplot2 - save

  • 存图像
ggsave("./fig/plot.png", width = 8, height = 6)
ggsave("./fig/plot.pdf", width = 8, height = 6)
##    <U+683C><U+5F0F>
## 1  "eps"           
## 2  "ps"            
## 3  "tex(pictex)"   
## 4  "pdf"           
## 5  "jpeg"          
## 6  "tiff"          
## 7  "png"           
## 8  "bmp"           
## 9  "svg"           
## 10 "wmf"
  • 存图像数据
save(mpg.plot, file = "plot.rdata") 

5 交互作图

# To ensure plotly is installed correctly, try loading the package and creating this example by pasting the code inside your R console.
library(plotly)
plot_ly(z = ~volcano)
## No trace type specified:
##   Based on info supplied, a 'heatmap' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#heatmap
plot_ly(z = ~volcano, type = "surface")
data(gapminder, package = "gapminder")
gg <- ggplot(gapminder, aes(gdpPercap, lifeExp, color = continent)) +
  geom_point(aes(size = pop, frame = year, ids = country)) +
  scale_x_log10()
## Warning: Ignoring unknown aesthetics: frame, ids
ggplotly(gg)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

6 作业

首先,安装swirl包如下:

install.packages("swirl")

其次,启动swirl包如下:

# 载入swirl
library("swirl")

# 启动swirl
swirl()

这里会出现如下界面:

输入swirl()后,会继续出现

输入你自己的名字,然后会进入课程选择

好了,第三天作业,大家能完成6,7,15.