商务大数据智能分析 之 R2
Lecturer : 申 旌 周
jingzhou_shen@stu.xjtu.edu.cn
Instructor : 常 象 宇
xiangyuchang@xjtu.edu.cn
2017年3月12日
命令 | 描述 |
---|---|
+,-,*,/ |
加, 减, 乘, 除 |
^ |
取幂 |
%% |
取余 |
round() |
四舍五入 |
floor() |
向下取整 |
ceiling() |
向上取整 |
abs() |
绝对值 |
sqrt() |
平方根 |
log() |
对数 |
exp() |
指数 |
( ) |
改变运算优先级 |
&,|,!
22 %% 6
[1] 4
floor(22/6)
[1] 3
sqrt(22/6)
[1] 1.914854
log(22/6)
[1] 1.299283
<-
=
<<-
<-
双向赋值
asg <- 2
asg
[1] 2
3 -> asg
asg
[1] 3
=
单向传值
asg = 1
asg
[1] 1
# 这是错误的写法
1 = asg
=
函数参数传值
Usage
matrix(data = NA, nrow = 1, ncol = 1, ...)
matrix(1:6, 3)
[,1] [,2]
[1,] 1 4
[2,] 2 5
[3,] 3 6
=
函数参数传值 如何生成一个
2 × 3的矩阵?
=
函数参数传值 GOOD
matrix(1:6, ncol = 3)
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
BAD
matrix(1:6, ncol <- 3)
[,1] [,2]
[1,] 1 4
[2,] 2 5
[3,] 3 6
=
函数参数传值 BAD函数参数传值(续)
ls() # 显示当前环境变量
[1] "asg" "ncol"
mean(x = 1:3)
[1] 2
=
函数参数传值 BAD函数参数传值(续)
ls()
[1] "asg" "ncol"
mean(x <- 1:3)
[1] 2
ls()
[1] "asg" "ncol" "x"
<- 与 = 小结
除函数参数传值外, = 均可被替换为 <-, 反之则不然, 如: system.time(A <- matrix(1:6, 3)) 中只能用 <-.
<<- 表示给上一层环境中的变量赋值.
建议: 进行赋值运算时, 使用 <- 而非 =.
原子型对象
is.logical(T)
[1] TRUE
is.logical(1)
[1] FALSE
原子型对象
typeof(7)
[1] "double"
is.numeric(7)
[1] TRUE
原子型对象
is.integer(7)
[1] FALSE
is.character("K")
[1] TRUE
非原子型对象
非原子型对象
vector(mode = "character", length = 3)
[1] "" "" ""
v <- c(0, 1, -2)
typeof(v)
[1] "double"
class(v)
[1] "numeric"
length(v)
[1] 3
as.character(v)
[1] "0" "1" "-2"
as.logical(v)
[1] FALSE TRUE TRUE
class(v)
[1] "numeric"
class(as.logical(v))
[1] "logical"
v[c(2, 3)]
[1] 1 -2
v[2:3]
[1] 1 -2
v[-2]
[1] 0 -2
v[which(v < 1)]
[1] 0 -2
names(v)
NULL
names(v) <- c("张三", "李四", "王五")
names(v)
[1] "张三" "李四" "王五"
v[c("王五", "李四")]
王五 李四
-2 1
t <- c(2, 1, 3)
(v + t)/2
张三 李四 王五
1.0 1.0 0.5
t > v
张三 李四 王五
TRUE FALSE TRUE
summary(v)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-2.0000 -1.0000 0.0000 -0.3333 0.5000 1.0000
命令 | 描述 |
---|---|
sum(v) |
对 v 的所有元素求和 |
mean(v) |
求 v 的均值 |
median(v) |
求 v 的中位数 |
min(v), max(v) |
求v 的最值 |
sd(v), var(v) |
求v 的标准差和方差 |
length(v) |
v 中的元素个数 |
pmax(v1, v2), pmin(v1, v2) |
示例: pmax(quiz1, quiz2) 返回每个学生两次测验中的较高成绩 |
sort(v) |
对v 排序 |
order(v) |
返回 v 排序后的索引 |
unique(v) |
对 v 去重 |
summary(v) |
“五数”概述 (另含均值) |
any(v) |
是否v 中存在一个元素为TRUE |
all(v) |
是否v 中所有元素均为TRUE |
非原子型对象
M <- matrix(1:9, nrow = 3, ncol = 3)
print(M)
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
length(M)
[1] 9
dim(M)
[1] 3 3
cbind(1:3, 4:6)
[,1] [,2]
[1,] 1 4
[2,] 2 5
[3,] 3 6
rbind(1:3, 4:6)
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
attributes(M)
$dim
[1] 3 3
vec <- 1:9
vec
[1] 1 2 3 4 5 6 7 8 9
attributes(vec)
NULL
dim(vec) <- c(3, 3)
vec
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
M[2,3]
[1] 8
M[1, ]
[1] 1 4 7
M[, 2]
[1] 4 5 6
Q <- matrix(rep(10, 9), 3, 3)
M * Q
[,1] [,2] [,3]
[1,] 10 40 70
[2,] 20 50 80
[3,] 30 60 90
M %*% Q
[,1] [,2] [,3]
[1,] 120 120 120
[2,] 150 150 150
[3,] 180 180 180
colMeans(M)
[1] 2 5 8
colSums(M)
[1] 6 15 24
colnames(M)
NULL
rownames(M)
NULL
非原子型对象
nutrition <- data.frame(fruit = c("apples", "pears", "bananas"), calories = c(52, 57, 92))
head(nutrition, 3)
fruit calories
1 apples 52
2 pears 57
3 bananas 92
ncol(nutrition)
[1] 2
nrow(nutrition)
[1] 3
names(nutrition)[1] <- "type"
attributes(nutrition)
$names
[1] "type" "calories"
$row.names
[1] 1 2 3
$class
[1] "data.frame"
colnames(nutrition)
[1] "type" "calories"
cbind(nutrition, nutrition)
type calories type calories
1 apples 52 apples 52
2 pears 57 pears 57
3 bananas 92 bananas 92
rbind(nutrition, nutrition)
type calories
1 apples 52
2 pears 57
3 bananas 92
4 apples 52
5 pears 57
6 bananas 92
str(nutrition)
'data.frame': 3 obs. of 2 variables:
$ type : Factor w/ 3 levels "apples","bananas",..: 1 3 2
$ calories: num 52 57 92
summary(nutrition)
type calories
apples :1 Min. :52.0
bananas:1 1st Qu.:54.5
pears :1 Median :57.0
Mean :67.0
3rd Qu.:74.5
Max. :92.0
nutrition[which(nutrition$type == "apples"), ]
type calories
1 apples 52
nutrition$calories # 返回结果为向量
[1] 52 57 92
nutrition[["calories"]] # 返回结果为向量
[1] 52 57 92
nutrition["calories"] # 返回结果为数据框
calories
1 52
2 57
3 92
calorie <- nutrition[["calories"]]
max(calorie)
[1] 92
subset(nutrition, select = "type", subset = calories > 80)
type
3 bananas
row.names(nutrition)
[1] "1" "2" "3"
rownames(nutrition)
[1] "1" "2" "3"
非原子型对象
list(
c("张三", "李四", "王五", "刘六"),
c(183, 168, 177, 167),
c("男", "女", "男"))
[[1]]
[1] "张三" "李四" "王五" "刘六"
[[2]]
[1] 183 168 177 167
[[3]]
[1] "男" "女" "男"
L <- list(
name = c("张三", "李四", "王五", "刘六"),
height = c(183, 168, 177, 167),
gender = c("男", "女", "男"))
L
$name
[1] "张三" "李四" "王五" "刘六"
$height
[1] 183 168 177 167
$gender
[1] "男" "女" "男"
str(L)
List of 3
$ name : chr [1:4] "张三" "李四" "王五" "刘六"
$ height: num [1:4] 183 168 177 167
$ gender: chr [1:3] "男" "女" "男"
Doraemon.bag <- list(
char = "K",
num = c(2, 2, 6),
mat = matrix(6, 2, 2),
df = data.frame(A = 1:3, I = 4:6),
list = list(D = 1, M = 9)
)
str(Doraemon.bag)
List of 5
$ char: chr "K"
$ num : num [1:3] 2 2 6
$ mat : num [1:2, 1:2] 6 6 6 6
$ df :'data.frame': 3 obs. of 2 variables:
..$ A: int [1:3] 1 2 3
..$ I: int [1:3] 4 5 6
$ list:List of 2
..$ D: num 1
..$ M: num 9
L$name
[1] "张三" "李四" "王五" "刘六"
L[["name"]]
[1] "张三" "李四" "王五" "刘六"
L["name"]
$name
[1] "张三" "李四" "王五" "刘六"
c(is.vector(L$name), is.vector(L[["name"]]),
is.list(L["name"]))
[1] TRUE TRUE TRUE
f <- factor(c("yes", "yes", "no", "no", "yes"))
f
[1] yes yes no no yes
Levels: no yes
attributes(f)
$levels
[1] "no" "yes"
$class
[1] "factor"
f <- factor(
c("yes", "yes", "no", "no", "yes"),
levels = c("yes", "no"))
f
[1] yes yes no no yes
Levels: yes no
attr(f, "levels") <- c("y", "n")
unclass(f)
[1] 1 1 2 2 1
attr(,"levels")
[1] "y" "n"
缺失值
q <- c(1, NaN, NA, 4)
class(q)
[1] "numeric"
is.nan(q)
[1] FALSE TRUE FALSE FALSE
is.na(q)
[1] FALSE TRUE TRUE FALSE
缺失值
mean(q)
[1] NaN
mean(q, na.rm = TRUE)  # na.rm = TRUE drops the NA and NaN entries before averaging
[1] 2.5
attributes()
访问
data <- read.csv("./data/data.csv", header = TRUE)
data
姓名 性别 年龄
1 张三 女 11
2 李四 女 12
3 王五 男 12
class(data)
[1] "data.frame"
write.csv(data, file = "./data/dataout.csv")
s <- 1:5
ifelse(s > 3, yes = "Good", no = "Fair")
[1] "Fair" "Fair" "Fair" "Good" "Good"
a <- 1
# `if` is an expression in R, so the value of the chosen branch is assigned
# directly: b becomes 1 when a exceeds 2, otherwise 0.
b <- if (a > 2) 1 else 0
b
[1] 0
a <- -1
# Three-way branch written as one expression: the value of the first branch
# whose condition holds is assigned to b.
b <- if (a > 2) {
  1
} else if (a < 0) {
  1
} else {
  0
}
b
[1] 1
# Print each value of the sequence 1, 2, 3 on its own line.
for (i in seq_len(3)) print(i)
[1] 1
[1] 2
[1] 3
M <- matrix(1:2, 2, 3)
M
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 2 2 2
numeric(3)
[1] 0 0 0
# Column sums by explicit iteration: col.sum[j] accumulates every entry of
# column j of M. seq_len() replaces 1:n so that a matrix with zero rows or
# zero columns skips the loop instead of iterating over c(1, 0).
col.sum <- numeric(ncol(M))
for (j in seq_len(ncol(M))) {
  for (i in seq_len(nrow(M))) {
    col.sum[j] <- col.sum[j] + M[i, j]
  }
}
col.sum
[1] 3 3 3
M
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 2 2 2
# Column sums by adding whole rows: each pass adds row i of M element-wise to
# the running totals. seq_len() keeps the loop empty when M has no rows,
# whereas 1:nrow(M) would try to index rows 1 and 0.
col.sum <- numeric(ncol(M))
for (i in seq_len(nrow(M))) {
  col.sum <- col.sum + M[i, ]
}
col.sum
[1] 3 3 3
apply(M, 2, sum)
[1] 3 3 3
apply(M, 1, sum)
[1] 3 6
# A list whose three elements have different classes: a matrix, a data frame
# and a nested list. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
L <- list(
  a = matrix(1:3, 1),
  b = data.frame(
    char = c("A", "B"),
    logic = c(TRUE, FALSE)),
  c = list(
    1:3,
    "T",
    matrix(1:6, 2, 3)))
L
$a
[,1] [,2] [,3]
[1,] 1 2 3
$b
char logic
1 A TRUE
2 B FALSE
$c
$c[[1]]
[1] 1 2 3
$c[[2]]
[1] "T"
$c[[3]]
[,1] [,2] [,3]
[1,] 1 3 5
[2,] 2 4 6
lapply(L, class)
$a
[1] "matrix"
$b
[1] "data.frame"
$c
[1] "list"
sapply(L, class)
a b c
"matrix" "data.frame" "list"
mapply(rep, 1:6, 6:1)
[[1]]
[1] 1 1 1 1 1 1
[[2]]
[1] 2 2 2 2 2
[[3]]
[1] 3 3 3 3
[[4]]
[1] 4 4 4
[[5]]
[1] 5 5
[[6]]
[1] 6
# Simulated data frame `bike` with n rows: orgn and dest are each drawn with
# replacement from three labels, and flux is rnorm(n, 18) rounded to whole
# numbers. The seed is fixed before the random draws.
# replace = TRUE is spelled out because T is a reassignable variable.
set.seed(6)
n <- 9
bike <- data.frame(
  orgn = sample(c("SE", "S", "N"), n, replace = TRUE),
  dest = sample(c("SE", "S", "N"), n, replace = TRUE),
  flux = round(rnorm(n, 18)))
bike
orgn dest flux
1 S SE 17
2 N S 20
3 SE N 17
4 S SE 19
5 N SE 18
6 N N 17
7 N SE 18
8 N S 20
9 S N 17
table(bike$orgn, bike$dest)
N S SE
N 1 2 2
S 1 0 2
SE 1 0 0
tapply(
bike$flux,
bike[c("orgn", "dest")],
mean)
dest
orgn N S SE
N 17 20 18
S 17 NA 18
SE 17 NA NA
命令 | 描述 |
---|---|
apply(X, MARGIN, FUN) |
通过对一个数组或矩阵X 的指定MARGIN 应用FUN 来得到一个向量/数组/列表 |
lapply(X, FUN) |
通过对一个列表X 的元素应用FUN 来得到一个列表 |
sapply(X, FUN) |
lapply 的简化版本, 返回一个向量/数组而不是列表 |
mapply(FUN) |
sapply 的多变元版本 |
tapply(X, INDEX, FUN) |
通过对INDEX 指定的因子组合应用FUN 函数来得到一个表格 |
# Counter-controlled while loop: print i, then advance it by one, for as long
# as i stays below 3.
i <- 0
while (i < 3) {
  print(i)
  i <- i + 1
}
[1] 0
[1] 1
[1] 2
i <- 0
# repeat has no condition of its own, so the loop must be left with break.
# A plain if/else handles the scalar test: ifelse() is a vectorised function
# and should not carry control flow or assignments in its arguments.
repeat {
  print(i)
  if (i > 1) {
    break
  } else {
    i <- i + 1
  }
}
[1] 0
[1] 1
[1] 2
# Skip the values up to 7 with next and print only 8, 9 and 10.
# No manual increment is needed: for() assigns the next element of 1:10 to i
# at the start of every iteration, so changing i inside the body would not
# affect which values are visited.
for (i in 1:10) {
  if (i <= 7) {
    next
  }
  print(i)
}
[1] 8
[1] 9
[1] 10
# Square of x, handed back to the caller with an explicit return().
square <- function(x) {
  squared <- x^2
  return(squared)
}
square(11)
[1] 121
# Square of x without return(): a function yields the value of the last
# expression it evaluates.
square <- function(x) x^2
square(11)
[1] 121
Thank you