商务大数据智能分析 之 R3
Lecturer : 申 旌 周
jingzhou_shen@stu.xjtu.edu.cn
Instructor : 常 象 宇
xiangyuchang@xjtu.edu.cn
2017年3月19日
函数
作图
rsummary <- function(x) {
return(
list(
mean = mean(x),
sd = sd(x),
quantiles = quantile(x)))
}
x <- rnorm(100, mean = 6, sd = 1)
rsummary(x)
$mean
[1] 5.895624
$sd
[1] 1.016297
$quantiles
0% 25% 50% 75% 100%
3.518802 5.106896 5.869846 6.557136 8.789925
calGrade <- function(x, base = 59) {
s <- x + base
if(s >= 90) {
grade <- "优"
} else if(s >= 80) {
grade <- "良"
} else if(s >= 70) {
grade <- "中"
} else if(x >= 60){
grade <- "及格"
}else{
grade <- "仍需努力"
}
grade
}
scores <- c(40, 20, 27, 31, 0, 0)
calGrade(scores) # 注意与下面语句的区别
[1] "优"
lapply(scores, calGrade) # 复习lapply
[[1]]
[1] "优"
[[2]]
[1] "中"
[[3]]
[1] "良"
[[4]]
[1] "优"
[[5]]
[1] "仍需努力"
[[6]]
[1] "仍需努力"
formals
formals(calGrade)
$x
$base
[1] 59
formals(plot)
$x
$y
$...
(M <- matrix(1:6, nrow = 2)) # 精确匹配
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
(M <- matrix(1:6, nr = 2)) # 部分匹配
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
(M <- matrix(1:6, 2)) # 位置匹配
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
...
quantile(x, probs = seq(0, 1, 0.25), ...)
apply(X, MARGIN, FUN, ...)
set.seed(1)
(x <- matrix(rnorm(12), 3))
[,1] [,2] [,3] [,4]
[1,] -0.6264538 1.5952808 0.4874291 -0.3053884
[2,] 0.1836433 0.3295078 0.7383247 1.5117812
[3,] -0.8356286 -0.8204684 0.5757814 0.3898432
apply(x, 2, quantile) # matrix
[,1] [,2] [,3] [,4]
0% -0.8356286 -0.8204684 0.4874291 -0.30538839
25% -0.7310412 -0.2454803 0.5316052 0.04222742
50% -0.6264538 0.3295078 0.5757814 0.38984324
75% -0.2214052 0.9623943 0.6570530 0.95081220
100% 0.1836433 1.5952808 0.7383247 1.51178117
...
quantile(x, probs = seq(0, 1, 0.25), ...)
apply(X, MARGIN, FUN, ...)
set.seed(1)
(x <- matrix(rnorm(12), 3))
[,1] [,2] [,3] [,4]
[1,] -0.6264538 1.5952808 0.4874291 -0.3053884
[2,] 0.1836433 0.3295078 0.7383247 1.5117812
[3,] -0.8356286 -0.8204684 0.5757814 0.3898432
apply(x, 2, quantile, probs = c(.25, .75))
[,1] [,2] [,3] [,4]
25% -0.7310412 -0.2454803 0.5316052 0.04222742
75% -0.2214052 0.9623943 0.6570530 0.95081220
...
str(paste)
function (..., sep = " ", collapse = NULL)
paste("Hi", " There", sep = "!")
[1] "Hi! There"
paste("Hi", " There", se = "!")
[1] "Hi There !"
...
x <- 1:30
plot(x, log(x))
...
my.plot <-
function(x, y, type = "l", ...){
plot(x, y, type = type, ...)
} # 自定义函数
my.plot(x, log(x))
source("./MyPlotEncap.R")
my.plot.encap(x, log(x))
library(MASS)
str(birthwt)
'data.frame': 189 obs. of 10 variables:
$ low : int 0 0 0 0 0 0 0 0 0 0 ...
$ age : int 19 33 20 21 18 21 22 17 29 26 ...
$ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
$ race : int 2 3 1 1 1 3 1 3 1 1 ...
$ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
$ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
$ ht : int 0 0 0 0 0 0 0 0 0 0 ...
$ ui : int 1 0 0 1 1 0 0 0 0 0 ...
$ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
$ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
head(birthwt, 4)
low age lwt race smoke ptl ht ui ftv bwt
85 0 19 182 2 0 0 0 1 0 2523
86 0 33 155 3 0 0 0 0 3 2551
87 0 20 105 1 1 0 0 0 1 2557
88 0 21 108 1 1 0 0 1 2 2594
colnames(birthwt) <-
c("birthwt.below.2500",
"mother.age", "mother.weight",
"race", "mother.smokes", "previous.prem.labor",
"hypertension", "uterine.irr", "physician.visits",
"birthwt.grams")
birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)
levels(birthwt$mother.smokes)
[1] "0" "1"
plot(birthwt$mother.age)
hist(birthwt$mother.age)
with(birthwt, plot (mother.age, birthwt.grams,
xlab = "孕期妈妈年龄",
ylab = "新生儿体重(克)",
col = mother.smokes,
pch = 19, # 实心点
cex = 0.7)) # 点的大小
abline(h = 2500)
legend("bottomright", c("不抽烟","抽烟"), col=c(1,2), pch=19)
plot(birthwt$mother.smokes)
plot(x, ...)
的作图行为因 class(x)
而变plot(birthwt$mother.smokes,
main = "孕期妈妈抽烟分布图",
xlab = "孕期妈妈是否抽烟",
ylab = "数量",
col = "lightblue")
par(mfrow = c(1, 1))
with(birthwt,
plot(mother.smokes,
birthwt.grams,
xlab = "孕期妈妈是否抽烟",
ylab = "新生儿体重(克)"))
with(birthwt,
plot(physician.visits,
birthwt.grams,
xlab = "就诊数量",
ylab = "新生儿体重(克)",
col = 'lightblue'))
with(birthwt, plot(as.factor(physician.visits), birthwt.grams,
xlab = "就诊数量",
ylab = "新生儿体重(克)",
col = 'lightblue'))
png(file= "scatter.png", bg = "white", res = 120)
with(birthwt,
plot(as.factor(physician.visits),
birthwt.grams,
xlab = "就诊数量",
ylab = "新生儿体重(克)",
col = 'lightblue'))
dev.off()
setwd()
getwd()
with(birthwt,
plot(mother.age,
birthwt.grams))
library(ggplot2)
qplot(x = mother.age, y = birthwt.grams, data = birthwt)
birthwt$race <- as.factor(birthwt$race)
qplot(x = mother.age, y = birthwt.grams, data = birthwt, color = race, shape = race, # 1 = white, 2 = black, 3 = other
xlab="孕期妈妈年龄",ylab="新生儿体重(克)")
p <- ggplot(
data,
mapping = aes(x, y, ...) # 图形对象
p <- p + layer( # 图形对象加图层
geom = "...",
geom_params = list(...),
stat = "...",
stat_params = list(...))
p
ggplot(
data,
aes(x, y, <other aesthetics>)) +
geom_XXX() +
stat_XXX()
ggplot(
data,
aes(x, y,
<other aesthetics>)) +
geom_XXX() +
stat_XXX()
str(mpg)
Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 11 variables:
$ manufacturer: chr "audi" "audi" "audi" "audi" ...
$ model : chr "a4" "a4" "a4" "a4" ...
$ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
$ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
$ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
$ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
$ drv : chr "f" "f" "f" "f" ...
$ cty : int 18 21 20 21 16 18 18 18 16 20 ...
$ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
$ fl : chr "p" "p" "p" "p" ...
$ class : chr "compact" "compact" "compact" "compact" ...
ggplot(mpg,
aes(x = drv)) +
geom_bar()
ggplot(mpg,
aes(x = drv,
fill = as.factor(cyl))) +
geom_bar()
ggplot(mpg,
aes(x = drv,
fill = as.factor(cyl))) +
geom_bar()
ggplot(mpg,
aes(x = drv,
fill = as.factor(cyl))) +
geom_bar() +
coord_flip()
ggplot(mpg,
aes(x = drv,
fill = as.factor(cyl))) +
geom_bar(position='dodge')
library(plyr)
mpg.avg <- ddply(
mpg,
c('drv', 'cyl'),
summarize,
avg.hwy = mean(hwy))
mpg.avg
drv cyl avg.hwy
1 4 4 24.60870
2 4 6 19.50000
3 4 8 16.35417
4 f 4 30.46552
5 f 5 28.75000
6 f 6 25.06977
7 f 8 25.00000
8 r 6 25.25000
9 r 8 20.19048
ggplot(mpg,
aes(x = drv, fill = as.factor(cyl))) + geom_bar(position='dodge',
stat = 'count')
ggplot(mpg.avg,
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity')
ggplot(mpg,
aes(x=displ, y=hwy)) +
geom_point(color='blue',
size=2.5)
ggplot(mpg,
aes(x=displ, y=hwy, color=class)) +
geom_point(size=2.5)
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() + facet_wrap("class")
ggplot(
mpg,
aes(x=displ, y=hwy)) + geom_point() +
facet_grid(drv ~ cyl)
mpg.no.facet <- subset(mpg,
select = c('displ', 'hwy'))
ggplot(mpg, aes(x=displ, y=hwy)) + geom_point() +
facet_grid(drv ~ cyl) + geom_point(data = mpg.no.facet, color = 'grey', size = 1)
library(plyr)
mpg.avg <- ddply(
mpg,
c('drv', 'cyl'),
summarize,
avg.hwy = mean(hwy))
mpg.avg
drv cyl avg.hwy
1 4 4 24.60870
2 4 6 19.50000
3 4 8 16.35417
4 f 4 30.46552
5 f 5 28.75000
6 f 6 25.06977
7 f 8 25.00000
8 r 6 25.25000
9 r 8 20.19048
ggplot(
mpg.avg,
aes(x=drv, y=avg.hwy,
color=as.factor(cyl), group = cyl)) +
geom_line() +
geom_point(size=4)
ggplot(mpg.avg,
aes(x=drv, fill = as.factor(cyl), y = avg.hwy)) + geom_bar(position='dodge', stat='identity') # 对比用
mpg.means <- ddply(mpg, 'class', summarize, avg.cty = mean(cty))
mpg.means
class avg.cty
1 2seater 15.40000
2 compact 20.12766
3 midsize 18.75610
4 minivan 15.81818
5 pickup 13.00000
6 subcompact 20.37143
7 suv 13.50000
mpg.plot <- ggplot(mpg.means, aes(x=class, y = avg.cty)) + geom_bar(stat='identity')
mpg.plot
mpg.plot <- mpg.plot +
labs(
x = '', y = '英里每加仑',
title='不同车型的城市里程')
mpg.plot
mpg.plot + theme(
text = element_text(size=22),
axis.text.x = element_text(
angle=45,
vjust=1,
hjust=1))
ggsave("./fig/plot.png", width = 8, height = 6)
ggsave("./fig/plot.pdf", width = 8, height = 6)
格式
1 "eps"
2 "ps"
3 "tex(pictex)"
4 "pdf"
5 "jpeg"
6 "tiff"
7 "png"
8 "bmp"
9 "svg"
10 "wmf"
save(mpg.plot, file = "plot.rdata")
Thank you