商务大数据智能分析 之 R4
Lecturer : 申 旌 周
jingzhou_shen@stu.xjtu.edu.cn
Instructor : 常 象 宇
xiangyuchang@xjtu.edu.cn
2017年4月9日
Linear Model for Regression
Linear Model for Classification
library(MASS)
str(birthwt)
'data.frame': 189 obs. of 10 variables:
$ low : int 0 0 0 0 0 0 0 0 0 0 ...
$ age : int 19 33 20 21 18 21 22 17 29 26 ...
$ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
$ race : int 2 3 1 1 1 3 1 3 1 1 ...
$ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
$ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
$ ht : int 0 0 0 0 0 0 0 0 0 0 ...
$ ui : int 1 0 0 1 1 0 0 0 0 0 ...
$ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
$ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
head(birthwt)
low age lwt race smoke ptl ht ui ftv bwt
85 0 19 182 2 0 0 0 1 0 2523
86 0 33 155 3 0 0 0 0 3 2551
87 0 20 105 1 1 0 0 0 1 2557
88 0 21 108 1 1 0 0 1 2 2594
89 0 18 107 1 1 0 0 1 0 2600
91 0 21 124 3 0 0 0 0 0 2622
colnames(birthwt) <-
c("birthwt.below.2500",
"mother.age", "mother.weight",
"race", "mother.smokes", "previous.prem.labor",
"hypertension", "uterine.irr", "physician.visits",
"birthwt.grams")
birthwt$race <- as.factor(birthwt$race)
(attr(birthwt$race, "levels") <- c("White", "Black", "Other"))
[1] "White" "Black" "Other"
birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)
(attr(birthwt$mother.smokes, "levels") <- c("Nonsmoker", "Smoker"))
[1] "Nonsmoker" "Smoker"
lm(birthwt.grams ~ mother.age, data = birthwt)
Call:
lm(formula = birthwt.grams ~ mother.age, data = birthwt)
Coefficients:
(Intercept) mother.age
2655.74 12.43
library(ggplot2)
# birthwt.grams ~ mother.age
p <- ggplot(
birthwt,
aes(x = mother.age, y = birthwt.grams))
plot.simple <- p +
geom_point(aes(colour = race),
size = 4) +
stat_smooth(method = "lm") +
theme(text = element_text(size = 22))
plot.simple
slr <- lm(birthwt.grams ~ mother.age, data = birthwt)
attributes(slr)
$names
[1] "coefficients" "residuals" "effects" "rank"
[5] "fitted.values" "assign" "qr" "df.residual"
[9] "xlevels" "call" "terms" "model"
$class
[1] "lm"
summary(slr)
Call:
lm(formula = birthwt.grams ~ mother.age, data = birthwt)
Residuals:
Min 1Q Median 3Q Max
-2294.78 -517.63 10.51 530.80 1774.92
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2655.74 238.86 11.12 <2e-16 ***
mother.age 12.43 10.02 1.24 0.216
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 728.2 on 187 degrees of freedom
Multiple R-squared: 0.008157, Adjusted R-squared: 0.002853
F-statistic: 1.538 on 1 and 187 DF, p-value: 0.2165
mlr <- lm(birthwt.grams ~ mother.age + race, data = birthwt)
mlr
Call:
lm(formula = birthwt.grams ~ mother.age + race, data = birthwt)
Coefficients:
(Intercept) mother.age raceBlack raceOther
2949.979 6.288 -365.715 -285.466
coef(mlr)
(Intercept) mother.age raceBlack raceOther
2949.979032 6.287741 -365.715000 -285.465780
confint(mlr)
2.5 % 97.5 %
(Intercept) 2446.20259 3453.75547
mother.age -13.58435 26.15984
raceBlack -682.62810 -48.80190
raceOther -513.39407 -57.53749
intercepts <- c(
coef(mlr)["(Intercept)"],
coef(mlr)["(Intercept)"] + coef(mlr)["raceBlack"],
coef(mlr)["(Intercept)"] + coef(mlr)["raceOther"])
lines.df <- data.frame(
intercepts = intercepts, slopes = rep(coef(mlr)["mother.age"], 3),
race = levels(birthwt$race))
plot.mlr <-
ggplot(x = mother.age,
y = birthwt.grams,
color = race,
data = birthwt) +
geom_abline(
aes(
intercept = intercepts,
slope = slopes,
color = race),
data = lines.df) +
theme(text = element_text(size = 22))
plot.mlr
mlr.inter <- lm(birthwt.grams ~ mother.age * race, data = birthwt)
mlr.inter
Call:
lm(formula = birthwt.grams ~ mother.age * race, data = birthwt)
Coefficients:
(Intercept) mother.age raceBlack
2583.54 21.37 1022.79
raceOther mother.age:raceBlack mother.age:raceOther
326.05 -62.54 -26.03
plot.mlr.inter <-
qplot(x = mother.age,
y = birthwt.grams,
color = race,
data = birthwt) +
stat_smooth(
method = "lm",
se = FALSE,
fullrange = TRUE) +
theme(text = element_text(size = 22))
plot.mlr.inter
birthwt.grams ~ mother.age + race
plot.mlr
\[ \begin{equation}\large y = \text{Intercept}_{race} + \beta \times \text{age} \end{equation} \]
birthwt.grams ~ mother.age * race
plot.mlr.inter
\[ \begin{equation}\large y = \text{Intercept}_{race} + \beta_{race}\times \text{age} \end{equation} \]
summary(mlr)
Call:
lm(formula = birthwt.grams ~ mother.age + race, data = birthwt)
Residuals:
Min 1Q Median 3Q Max
-2131.57 -488.02 -1.16 521.87 1757.07
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2949.979 255.352 11.553 <2e-16 ***
mother.age 6.288 10.073 0.624 0.5332
raceBlack -365.715 160.636 -2.277 0.0240 *
raceOther -285.466 115.531 -2.471 0.0144 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 715.7 on 185 degrees of freedom
Multiple R-squared: 0.05217, Adjusted R-squared: 0.0368
F-statistic: 3.394 on 3 and 185 DF, p-value: 0.01909
summary(mlr.inter)
Call:
lm(formula = birthwt.grams ~ mother.age * race, data = birthwt)
Residuals:
Min 1Q Median 3Q Max
-2182.35 -474.23 13.48 523.86 1496.51
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2583.54 321.52 8.035 1.11e-13 ***
mother.age 21.37 12.89 1.658 0.0991 .
raceBlack 1022.79 694.21 1.473 0.1424
raceOther 326.05 545.30 0.598 0.5506
mother.age:raceBlack -62.54 30.67 -2.039 0.0429 *
mother.age:raceOther -26.03 23.20 -1.122 0.2633
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 710.7 on 183 degrees of freedom
Multiple R-squared: 0.07541, Adjusted R-squared: 0.05015
F-statistic: 2.985 on 5 and 183 DF, p-value: 0.01291
lm.full <- lm(birthwt.grams ~ ., data = birthwt)
bwd <- step(lm.full, direction = "backward", trace = 0)
lm.null <- lm(birthwt.grams ~ 1, data = birthwt)
fwd <- step(lm.null, direction = "forward", scope = ( ~ birthwt.below.2500 + mother.age + mother.weight + race + mother.smokes + previous.prem.labor + hypertension + uterine.irr + physician.visits), trace = 0)
summary(bwd)
Call:
lm(formula = birthwt.grams ~ birthwt.below.2500 + race + mother.smokes +
uterine.irr, data = birthwt)
Residuals:
Min 1Q Median 3Q Max
-1039.37 -338.51 30.49 298.19 1489.19
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3500.81 58.71 59.624 < 2e-16 ***
birthwt.below.2500 -1131.17 71.33 -15.857 < 2e-16 ***
raceBlack -206.36 97.38 -2.119 0.035434 *
raceOther -189.89 74.74 -2.541 0.011898 *
mother.smokesSmoker -157.30 70.29 -2.238 0.026442 *
uterine.irr -309.28 90.04 -3.435 0.000734 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 432 on 183 degrees of freedom
Multiple R-squared: 0.6584, Adjusted R-squared: 0.6491
F-statistic: 70.55 on 5 and 183 DF, p-value: < 2.2e-16
summary(fwd)
Call:
lm(formula = birthwt.grams ~ birthwt.below.2500 + uterine.irr +
race + mother.smokes, data = birthwt)
Residuals:
Min 1Q Median 3Q Max
-1039.37 -338.51 30.49 298.19 1489.19
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3500.81 58.71 59.624 < 2e-16 ***
birthwt.below.2500 -1131.17 71.33 -15.857 < 2e-16 ***
uterine.irr -309.28 90.04 -3.435 0.000734 ***
raceBlack -206.36 97.38 -2.119 0.035434 *
raceOther -189.89 74.74 -2.541 0.011898 *
mother.smokesSmoker -157.30 70.29 -2.238 0.026442 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 432 on 183 degrees of freedom
Multiple R-squared: 0.6584, Adjusted R-squared: 0.6491
F-statistic: 70.55 on 5 and 183 DF, p-value: < 2.2e-16
wise <- step(lm.full, trace = 0); summary(wise)
Call:
lm(formula = birthwt.grams ~ birthwt.below.2500 + race + mother.smokes +
uterine.irr, data = birthwt)
Residuals:
Min 1Q Median 3Q Max
-1039.37 -338.51 30.49 298.19 1489.19
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3500.81 58.71 59.624 < 2e-16 ***
birthwt.below.2500 -1131.17 71.33 -15.857 < 2e-16 ***
raceBlack -206.36 97.38 -2.119 0.035434 *
raceOther -189.89 74.74 -2.541 0.011898 *
mother.smokesSmoker -157.30 70.29 -2.238 0.026442 *
uterine.irr -309.28 90.04 -3.435 0.000734 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 432 on 183 degrees of freedom
Multiple R-squared: 0.6584, Adjusted R-squared: 0.6491
F-statistic: 70.55 on 5 and 183 DF, p-value: < 2.2e-16
par(cex = 2)
plot(wise, which = c(1, 2), sub = "")
par(cex = 2)
plot(wise, which = c(3, 4), sub = "")
shapiro.test(wise$residuals)
Shapiro-Wilk normality test
data: wise$residuals
W = 0.98877, p-value = 0.142
library(lmtest)
dwtest(wise)
Durbin-Watson test
data: wise
DW = 0.417, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0
library(car)
outlierTest(wise)
No Studentized residuals with Bonferonni p < 0.05
Largest |rstudent|:
rstudent unadjusted p-value Bonferonni p
226 3.590985 0.00042366 0.080071
(pred <- predict(wise, newdata = birthwt[c(3, 6, 7), ]))
87 91 92
3343.513 3310.928 3500.814
(real <- birthwt[c(3, 6, 7), "birthwt.grams"])
[1] 2557 2622 2637
(mean(pred - real))
[1] 779.7516
predict(wise, newdata = birthwt[c(3, 6, 7), ], interval = "prediction")
fit lwr upr
87 3343.513 2482.964 4204.061
91 3310.928 2450.411 4171.445
92 3500.814 2640.684 4360.945
y ~ u * v
y ~ (u + v)^2
y ~ u + v + u:v
y ~ u + v + w + u:v + u:w
y ~ u + v + w + u(v + w)
y ~ u * (v + w)
y ~ u * v * w
y ~ poly(x, 3, raw = TRUE)
y ~ x + I(x^2) + I(x^3)
y ~ x + 0
library(lars)
data("diabetes") # 糖尿病患者的血液化验指标数据集
# y: 因变量; x: 标准化的自变量矩阵;
# x2: 自变量矩阵, 由x及一些交互作用组成
names(diabetes)
[1] "x" "y" "x2"
list(length(diabetes$y), dim(diabetes$x), dim(diabetes$x2))
[[1]]
[1] 442
[[2]]
[1] 442 10
[[3]]
[1] 442 64
y <- diabetes$y
x <- diabetes$x
x2 <- diabetes$x2
kappa(x2)
[1] 11427.09
library(car)
y.x2 <- as.data.frame(cbind(y, x2))
v <- vif(lm(y ~ ., y.x2))
sort(v, decreasing = T)[1:5]
tc ldl hdl ltg `tc:ltg`
1295001.21 1000312.11 180836.83 139965.06 61177.87
library(MASS)
a <- lm.ridge(y ~ ., lambda = seq(0, 150, length = 151),
data = y.x2, model = TRUE)
names(a)
[1] "coef" "scales" "Inter" "lambda" "ym" "xm" "GCV" "kHKB"
[9] "kLW"
a$lambda[which.min(a$GCV)]
[1] 81
a$coef[which.min(a$GCV)]
[1] 17.12666
par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20)
abline(v = a$lambda[which.min(a$GCV)])
par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20, ylim = c(-30, 30))
abline(v = a$lambda[which.min(a$GCV)])
par(cex = 2)
plot(a$lambda, a$GCV, xlab = expression(lambda), ylab = expression(beta), type = "l")
abline(v = a$lambda[which.min(a$GCV)])
par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20, ylim = c(-30, 30))
abline(v = a$lambda[which.min(a$GCV)])
laa <- lars(x2, y)
par(cex = 2)
plot(laa)
dim(summary(laa))
[1] 105 3
dim(summary(laa))
[1] 105 3
min(laa$Cp)
[1] 18.19822
# Cp在第15步时取得最小值
summary(laa)[13:20, ]
LARS/LASSO
Call: lars(x = x2, y = y)
Df Rss Cp
12 13 1249726 25.058
13 14 1234993 21.858
14 15 1225552 20.526
15 16 1213289 18.198
16 17 1212253 19.833
17 18 1210149 21.090
18 19 1206003 21.627
19 20 1204605 23.134
par(cex = 2)
cva <- cv.lars(x2, y, K = 10)
(best <- cva$index[which.min(cva$cv)])
[1] 0.03030303
coef <- coef.lars(laa, mode = "fraction", s = best)
coef[coef!=0]
sex bmi map hdl ltg glu
-92.966618 502.356055 241.631884 -174.935230 465.398774 10.828814
bmi^2 glu^2 age:sex age:map age:ltg age:glu
33.918974 62.355697 97.307919 28.427380 4.497064 13.779861
bmi:map
79.712186
coef1 <- coef.lars(laa, mode = "step", s = 15)
coef1[coef1!=0]
sex bmi map hdl ltg glu
-112.312541 501.616811 251.792232 -187.827872 467.778725 18.130314
age^2 bmi^2 glu^2 age:sex age:map age:ltg
7.608734 38.748381 69.809696 107.727556 30.045318 8.613394
age:glu bmi:map
11.600284 85.688057
c(length(coef[coef!=0]), length(coef1[coef1!=0]))
[1] 13 14
vertebral.two <- read.table("./data/vertebral_column_data/column_2C.dat")
head(vertebral.two, 2)
V1 V2 V3 V4 V5 V6 V7
1 63.03 22.55 39.61 40.48 98.67 -0.25 AB
2 39.06 10.06 25.02 29.00 114.41 4.56 AB
c(mean(vertebral.two[, "V6"]), vertebral.two[116, "V6"])
[1] 26.29674 418.54000
runin <- lm(V6 ~ ., vertebral.two[-116, ])
vertebral.two[116, 6] <- predict(runin, vertebral.two[116, -6])
c(mean(vertebral.two[, "V6"]), vertebral.two[116, "V6"])
[1] 25.11047 50.79539
library(MASS)
model.lda <- lda(V7 ~ ., data = vertebral.two)
result.pred <- predict(model.lda, vertebral.two)$class
(confusion <- table(vertebral.two[, 7], result.pred))
result.pred
AB NO
AB 188 22
NO 23 77
(error.rate <- (sum(confusion)-sum(diag(confusion)))/sum(confusion))
[1] 0.1451613
model.logistic <- glm(V7 ~ ., data = vertebral.two, family = "binomial")
model.logistic <- step(model.logistic, trace = 0)
is.NO <- (predict(model.logistic, vertebral.two, type = "response") > 0.5)
result.pred <- rep("NO", nrow(vertebral.two))
result.pred[!is.NO] <- "AB"
(confusion <- table(vertebral.two[, 7], result.pred))
result.pred
AB NO
AB 186 24
NO 22 78
(error.rate <- (sum(confusion)-sum(diag(confusion)))/sum(confusion))
[1] 0.1483871
summary(model.logistic)
Call:
glm(formula = V7 ~ V1 + V2 + V3 + V5 + V6, family = "binomial",
data = vertebral.two)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.19329 -0.37733 -0.02971 0.40452 2.69386
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -14.74316 3.20375 -4.602 4.19e-06 ***
V1 0.08323 0.02416 3.445 0.000571 ***
V2 -0.16219 0.03586 -4.523 6.10e-06 ***
V3 0.02729 0.01953 1.397 0.162357
V5 0.10493 0.02271 4.619 3.85e-06 ***
V6 -0.17023 0.02336 -7.288 3.15e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 389.86 on 309 degrees of freedom
Residual deviance: 178.98 on 304 degrees of freedom
AIC: 190.98
Number of Fisher Scoring iterations: 7
library(knitr)
kable(summary(model.logistic)$coef, digits = 4)
Estimate | Std. Error | z value | Pr(>|z|) | |
---|---|---|---|---|
(Intercept) | -14.7432 | 3.2037 | -4.6018 | 0.0000 |
V1 | 0.0832 | 0.0242 | 3.4452 | 0.0006 |
V2 | -0.1622 | 0.0359 | -4.5231 | 0.0000 |
V3 | 0.0273 | 0.0195 | 1.3972 | 0.1624 |
V5 | 0.1049 | 0.0227 | 4.6193 | 0.0000 |
V6 | -0.1702 | 0.0234 | -7.2879 | 0.0000 |
\[ \begin{equation}\large \ln(\frac{p}{1-p}) = 14.7432 + 0.0832 V1 - 0.1622 V2 + 0.0273 V3 \\+ 0.1049 V5 - 0.1702 V6 \end{equation} \]
Thank you