一言不合就学R

商务大数据智能分析之 R4

Lecturer : 申旌周
jingzhou_shen@stu.xjtu.edu.cn

Instructor : 常象宇
xiangyuchang@xjtu.edu.cn

2017年4月9日

概览

Linear Model for Regression
Linear Model for Classification

Linear Model for Regression

Linear Regression
Stepwise Regression
Ridge
LASSO

Linear Regression

Data
Simple Linear Regression
Multiple Linear Regression

Data

library(MASS)

str(birthwt)

'data.frame':   189 obs. of  10 variables:
 $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
 $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
 $ race : int  2 3 1 1 1 3 1 3 1 1 ...
 $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
 $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
 $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
 $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Data

head(birthwt)

   low age lwt race smoke ptl ht ui ftv  bwt
85   0  19 182    2     0   0  0  1   0 2523
86   0  33 155    3     0   0  0  0   3 2551
87   0  20 105    1     1   0  0  0   1 2557
88   0  21 108    1     1   0  0  1   2 2594
89   0  18 107    1     1   0  0  1   0 2600
91   0  21 124    3     0   0  0  0   0 2622

colnames(birthwt) <- 
  c("birthwt.below.2500", 
    "mother.age", "mother.weight",  
    "race", "mother.smokes", "previous.prem.labor",
    "hypertension", "uterine.irr", "physician.visits", 
    "birthwt.grams")

Data

birthwt$race <- as.factor(birthwt$race)

(attr(birthwt$race, "levels") <- c("White", "Black", "Other"))

[1] "White" "Black" "Other"

birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)

(attr(birthwt$mother.smokes, "levels") <- c("Nonsmoker", "Smoker"))

[1] "Nonsmoker" "Smoker"

Simple Linear Regression

lm(birthwt.grams ~ mother.age, data = birthwt)


Call:
lm(formula = birthwt.grams ~ mother.age, data = birthwt)

Coefficients:
(Intercept)   mother.age  
    2655.74        12.43

Simple Linear Regression

library(ggplot2)

# birthwt.grams ~ mother.age
p <- ggplot(
  birthwt, 
  aes(x = mother.age, y = birthwt.grams))

plot.simple <- p + 
  geom_point(aes(colour = race), 
             size = 4) + 
  stat_smooth(method = "lm") + 
  theme(text = element_text(size = 22))

plot.simple

plot of chunk unnamed-chunk-12

Simple Linear Regression

slr <- lm(birthwt.grams ~ mother.age, data = birthwt)

attributes(slr)

$names
 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"        

$class
[1] "lm"

Simple Linear Regression

summary(slr)


Call:
lm(formula = birthwt.grams ~ mother.age, data = birthwt)

Residuals:
     Min       1Q   Median       3Q      Max 
-2294.78  -517.63    10.51   530.80  1774.92 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2655.74     238.86   11.12   <2e-16 ***
mother.age     12.43      10.02    1.24    0.216    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 728.2 on 187 degrees of freedom
Multiple R-squared:  0.008157,  Adjusted R-squared:  0.002853 
F-statistic: 1.538 on 1 and 187 DF,  p-value: 0.2165

Multiple Linear Regression

mlr <- lm(birthwt.grams ~ mother.age + race, data = birthwt)

mlr


Call:
lm(formula = birthwt.grams ~ mother.age + race, data = birthwt)

Coefficients:
(Intercept)   mother.age    raceBlack    raceOther  
   2949.979        6.288     -365.715     -285.466

Birthweight = Intercept(based on race) + \( \large\beta \) * mother's age \[ \begin{equation}\large y = \text{Intercept}_{race} + \beta \times \text{age} \end{equation} \]

Multiple Linear Regression

coef(mlr)

(Intercept)  mother.age   raceBlack   raceOther 
2949.979032    6.287741 -365.715000 -285.465780

confint(mlr)

                 2.5 %     97.5 %
(Intercept) 2446.20259 3453.75547
mother.age   -13.58435   26.15984
raceBlack   -682.62810  -48.80190
raceOther   -513.39407  -57.53749

intercepts <- c(
  coef(mlr)["(Intercept)"],
  coef(mlr)["(Intercept)"] + coef(mlr)["raceBlack"],
  coef(mlr)["(Intercept)"] + coef(mlr)["raceOther"])

lines.df <- data.frame(
  intercepts = intercepts, slopes = rep(coef(mlr)["mother.age"], 3),
  race = levels(birthwt$race))

Multiple Linear Regression

plot.mlr <- 
  ggplot(x = mother.age, 
        y = birthwt.grams,
        color = race,
        data = birthwt) + 
  geom_abline(
    aes(
      intercept = intercepts, 
      slope = slopes, 
      color = race), 
    data = lines.df) + 
  theme(text = element_text(size = 22))

plot.mlr

plot of chunk unnamed-chunk-20

Multiple Linear Regression

mlr.inter <- lm(birthwt.grams ~ mother.age * race, data = birthwt)

mlr.inter


Call:
lm(formula = birthwt.grams ~ mother.age * race, data = birthwt)

Coefficients:
         (Intercept)            mother.age             raceBlack  
             2583.54                 21.37               1022.79  
           raceOther  mother.age:raceBlack  mother.age:raceOther  
              326.05                -62.54                -26.03

Birthweight = Intercept(based on race) + \( \large\beta \)(based on race) * mother's age
\[ \begin{equation}\large y = \text{Intercept}_{race} + \beta_{race}\times \text{age} \end{equation} \]

Multiple Linear Regression

plot.mlr.inter <- 
  qplot(x = mother.age, 
        y = birthwt.grams, 
        color = race,
        data = birthwt) + 
  stat_smooth(
    method = "lm", 
    se = FALSE, 
    fullrange = TRUE) + 
  theme(text = element_text(size = 22))

plot.mlr.inter

plot of chunk unnamed-chunk-24

Multiple Linear Regression

birthwt.grams ~ mother.age + race

plot.mlr

plot of chunk unnamed-chunk-25

\[ \begin{equation}\large y = \text{Intercept}_{race} + \beta \times \text{age} \end{equation} \]

birthwt.grams ~ mother.age * race

plot.mlr.inter

plot of chunk unnamed-chunk-26

\[ \begin{equation}\large y = \text{Intercept}_{race} + \beta_{race}\times \text{age} \end{equation} \]

Multiple Linear Regression

summary(mlr)


Call:
lm(formula = birthwt.grams ~ mother.age + race, data = birthwt)

Residuals:
     Min       1Q   Median       3Q      Max 
-2131.57  -488.02    -1.16   521.87  1757.07 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 2949.979    255.352  11.553   <2e-16 ***
mother.age     6.288     10.073   0.624   0.5332    
raceBlack   -365.715    160.636  -2.277   0.0240 *  
raceOther   -285.466    115.531  -2.471   0.0144 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 715.7 on 185 degrees of freedom
Multiple R-squared:  0.05217,   Adjusted R-squared:  0.0368 
F-statistic: 3.394 on 3 and 185 DF,  p-value: 0.01909

Multiple Linear Regression

summary(mlr.inter)


Call:
lm(formula = birthwt.grams ~ mother.age * race, data = birthwt)

Residuals:
     Min       1Q   Median       3Q      Max 
-2182.35  -474.23    13.48   523.86  1496.51 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)           2583.54     321.52   8.035 1.11e-13 ***
mother.age              21.37      12.89   1.658   0.0991 .  
raceBlack             1022.79     694.21   1.473   0.1424    
raceOther              326.05     545.30   0.598   0.5506    
mother.age:raceBlack   -62.54      30.67  -2.039   0.0429 *  
mother.age:raceOther   -26.03      23.20  -1.122   0.2633    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 710.7 on 183 degrees of freedom
Multiple R-squared:  0.07541,   Adjusted R-squared:  0.05015 
F-statistic: 2.985 on 5 and 183 DF,  p-value: 0.01291

Stepwise Regression

lm.full <- lm(birthwt.grams ~ ., data = birthwt)

bwd <- step(lm.full, direction = "backward", trace = 0)

lm.null <- lm(birthwt.grams ~ 1, data = birthwt)

fwd <- step(lm.null, direction = "forward", scope = ( ~ birthwt.below.2500 + mother.age + mother.weight + race + mother.smokes + previous.prem.labor + hypertension + uterine.irr + physician.visits), trace = 0)

Stepwise Regression

summary(bwd)


Call:
lm(formula = birthwt.grams ~ birthwt.below.2500 + race + mother.smokes + 
    uterine.irr, data = birthwt)

Residuals:
     Min       1Q   Median       3Q      Max 
-1039.37  -338.51    30.49   298.19  1489.19 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)          3500.81      58.71  59.624  < 2e-16 ***
birthwt.below.2500  -1131.17      71.33 -15.857  < 2e-16 ***
raceBlack            -206.36      97.38  -2.119 0.035434 *  
raceOther            -189.89      74.74  -2.541 0.011898 *  
mother.smokesSmoker  -157.30      70.29  -2.238 0.026442 *  
uterine.irr          -309.28      90.04  -3.435 0.000734 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 432 on 183 degrees of freedom
Multiple R-squared:  0.6584,    Adjusted R-squared:  0.6491 
F-statistic: 70.55 on 5 and 183 DF,  p-value: < 2.2e-16

Stepwise Regression

summary(fwd)


Call:
lm(formula = birthwt.grams ~ birthwt.below.2500 + uterine.irr + 
    race + mother.smokes, data = birthwt)

Residuals:
     Min       1Q   Median       3Q      Max 
-1039.37  -338.51    30.49   298.19  1489.19 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)          3500.81      58.71  59.624  < 2e-16 ***
birthwt.below.2500  -1131.17      71.33 -15.857  < 2e-16 ***
uterine.irr          -309.28      90.04  -3.435 0.000734 ***
raceBlack            -206.36      97.38  -2.119 0.035434 *  
raceOther            -189.89      74.74  -2.541 0.011898 *  
mother.smokesSmoker  -157.30      70.29  -2.238 0.026442 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 432 on 183 degrees of freedom
Multiple R-squared:  0.6584,    Adjusted R-squared:  0.6491 
F-statistic: 70.55 on 5 and 183 DF,  p-value: < 2.2e-16

Stepwise Regression

wise <- step(lm.full, trace = 0);  summary(wise)


Call:
lm(formula = birthwt.grams ~ birthwt.below.2500 + race + mother.smokes + 
    uterine.irr, data = birthwt)

Residuals:
     Min       1Q   Median       3Q      Max 
-1039.37  -338.51    30.49   298.19  1489.19 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)          3500.81      58.71  59.624  < 2e-16 ***
birthwt.below.2500  -1131.17      71.33 -15.857  < 2e-16 ***
raceBlack            -206.36      97.38  -2.119 0.035434 *  
raceOther            -189.89      74.74  -2.541 0.011898 *  
mother.smokesSmoker  -157.30      70.29  -2.238 0.026442 *  
uterine.irr          -309.28      90.04  -3.435 0.000734 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 432 on 183 degrees of freedom
Multiple R-squared:  0.6584,    Adjusted R-squared:  0.6491 
F-statistic: 70.55 on 5 and 183 DF,  p-value: < 2.2e-16

回归诊断

par(cex = 2)
plot(wise, which = c(1, 2), sub = "")

plot of chunk unnamed-chunk-36

回归诊断

par(cex = 2)
plot(wise, which = c(3, 4), sub = "")

plot of chunk unnamed-chunk-37

回归诊断

残差正态性检验

shapiro.test(wise$residuals)


    Shapiro-Wilk normality test

data:  wise$residuals
W = 0.98877, p-value = 0.142

残差自相关检验

library(lmtest)
dwtest(wise)


    Durbin-Watson test

data:  wise
DW = 0.417, p-value < 2.2e-16
alternative hypothesis: true autocorrelation is greater than 0

离群点检测

library(car)
outlierTest(wise)


No Studentized residuals with Bonferonni p < 0.05
Largest |rstudent|:
    rstudent unadjusted p-value Bonferonni p
226 3.590985         0.00042366     0.080071

预测

(pred <- predict(wise, newdata = birthwt[c(3, 6, 7), ]))

      87       91       92 
3343.513 3310.928 3500.814

(real <- birthwt[c(3, 6, 7), "birthwt.grams"])

[1] 2557 2622 2637

(mean(pred - real))

[1] 779.7516

predict(wise, newdata = birthwt[c(3, 6, 7), ], interval = "prediction")

        fit      lwr      upr
87 3343.513 2482.964 4204.061
91 3310.928 2450.411 4171.445
92 3500.814 2640.684 4360.945

交乘项

\[ \begin{equation}\large y_{i} = \beta_{0} + \beta_{1}u_{i} + \beta_{2}v_{i} + \beta_{3}u_{i}v_{i} + \varepsilon_{i} \end{equation} \]

y ~ u * v
y ~ (u + v)^2
y ~ u + v + u:v

\[ \begin{equation}\large y_{i} = \beta_{0} + \beta_{1}u_{i} + \beta_{2}v_{i} + \beta_{3}w_{i} + \beta_{4}u_{i}v_{i} + \beta_{5}u_{i}w_{i} + \varepsilon_{i} \end{equation} \]

y ~ u + v + w + u:v + u:w
y ~ u + v + w + u(v + w)

y ~ u * (v + w)

y ~ u * v * w

多项式

多项式 \[ \begin{equation}\large y_{i} = \beta_{0} + \beta_{1}x_{i} + \beta_{2}x_{i}^{2} + \beta_{3}x_{i}^{3} + \varepsilon_{i} \end{equation} \]

y ~ poly(x, 3, raw = TRUE)
y ~ x + I(x^2) + I(x^3)

截距项为零 \[ \begin{equation}\large y_{i} = \beta x_{i} + \varepsilon_{i} \end{equation} \]

y ~ x + 0

Data - Ridge and LASSO

library(lars)
data("diabetes") # 糖尿病患者的血液化验指标数据集

#  y: 因变量； x: 标准化的自变量矩阵； 
#  x2: 自变量矩阵, 由x及一些交互作用组成
names(diabetes)

[1] "x"  "y"  "x2"

list(length(diabetes$y), dim(diabetes$x), dim(diabetes$x2))

[[1]]
[1] 442

[[2]]
[1] 442  10

[[3]]
[1] 442  64

y <- diabetes$y
x <- diabetes$x
x2 <- diabetes$x2

Ridge

kappa(x2)

[1] 11427.09

library(car)
y.x2 <- as.data.frame(cbind(y, x2))
v <- vif(lm(y ~ ., y.x2))
sort(v, decreasing = T)[1:5]

        tc        ldl        hdl        ltg   `tc:ltg` 
1295001.21 1000312.11  180836.83  139965.06   61177.87

Ridge

library(MASS)
a <- lm.ridge(y ~ ., lambda = seq(0, 150, length = 151),
              data = y.x2, model = TRUE)
names(a)

[1] "coef"   "scales" "Inter"  "lambda" "ym"     "xm"     "GCV"    "kHKB"  
[9] "kLW"

a$lambda[which.min(a$GCV)]

[1] 81

a$coef[which.min(a$GCV)]

[1] 17.12666

Ridge

par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20)
abline(v = a$lambda[which.min(a$GCV)])

plot of chunk unnamed-chunk-52

par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20, ylim = c(-30, 30))
abline(v = a$lambda[which.min(a$GCV)])

plot of chunk unnamed-chunk-53

Ridge

par(cex = 2)
plot(a$lambda, a$GCV, xlab = expression(lambda), ylab = expression(beta), type = "l")
abline(v = a$lambda[which.min(a$GCV)])

plot of chunk unnamed-chunk-54

par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20, ylim = c(-30, 30))
abline(v = a$lambda[which.min(a$GCV)])

plot of chunk unnamed-chunk-55

LASSO

laa <- lars(x2, y)
par(cex = 2)
plot(laa)

plot of chunk unnamed-chunk-56

dim(summary(laa))

[1] 105   3

dim(summary(laa))

[1] 105   3

min(laa$Cp)

[1] 18.19822

# Cp在第15步时取得最小值
summary(laa)[13:20, ]

LARS/LASSO
Call: lars(x = x2, y = y)
   Df     Rss     Cp
12 13 1249726 25.058
13 14 1234993 21.858
14 15 1225552 20.526
15 16 1213289 18.198
16 17 1212253 19.833
17 18 1210149 21.090
18 19 1206003 21.627
19 20 1204605 23.134

LASSO

par(cex = 2)
cva <- cv.lars(x2, y, K = 10)

plot of chunk unnamed-chunk-59

(best <- cva$index[which.min(cva$cv)])

[1] 0.03030303

LASSO

coef <- coef.lars(laa, mode = "fraction", s = best)
coef[coef!=0]

        sex         bmi         map         hdl         ltg         glu 
 -92.966618  502.356055  241.631884 -174.935230  465.398774   10.828814 
      bmi^2       glu^2     age:sex     age:map     age:ltg     age:glu 
  33.918974   62.355697   97.307919   28.427380    4.497064   13.779861 
    bmi:map 
  79.712186

coef1 <- coef.lars(laa, mode = "step", s = 15)
coef1[coef1!=0]

        sex         bmi         map         hdl         ltg         glu 
-112.312541  501.616811  251.792232 -187.827872  467.778725   18.130314 
      age^2       bmi^2       glu^2     age:sex     age:map     age:ltg 
   7.608734   38.748381   69.809696  107.727556   30.045318    8.613394 
    age:glu     bmi:map 
  11.600284   85.688057

c(length(coef[coef!=0]), length(coef1[coef1!=0]))

[1] 13 14

Linear Model for Classification

Data
Linear Discriminant Analysis
Logistic Regression

Data

脊柱数据

vertebral.two <- read.table("./data/vertebral_column_data/column_2C.dat")
head(vertebral.two, 2)

     V1    V2    V3    V4     V5    V6 V7
1 63.03 22.55 39.61 40.48  98.67 -0.25 AB
2 39.06 10.06 25.02 29.00 114.41  4.56 AB

c(mean(vertebral.two[, "V6"]), vertebral.two[116, "V6"])

[1]  26.29674 418.54000

runin <- lm(V6 ~ ., vertebral.two[-116, ])
vertebral.two[116, 6] <- predict(runin, vertebral.two[116, -6])

c(mean(vertebral.two[, "V6"]), vertebral.two[116, "V6"])

[1] 25.11047 50.79539

Linear Discriminant Analysis

library(MASS)

model.lda <- lda(V7 ~ ., data = vertebral.two)

result.pred <- predict(model.lda, vertebral.two)$class

(confusion <- table(vertebral.two[, 7], result.pred))

    result.pred
      AB  NO
  AB 188  22
  NO  23  77

(error.rate <- (sum(confusion)-sum(diag(confusion)))/sum(confusion))

[1] 0.1451613

Logistic Regression

model.logistic <- glm(V7 ~ ., data = vertebral.two, family = "binomial")

model.logistic <- step(model.logistic, trace = 0)

is.NO <- (predict(model.logistic, vertebral.two, type = "response") > 0.5)

result.pred <- rep("NO", nrow(vertebral.two))
result.pred[!is.NO] <- "AB"

(confusion <- table(vertebral.two[, 7], result.pred))

    result.pred
      AB  NO
  AB 186  24
  NO  22  78

(error.rate <- (sum(confusion)-sum(diag(confusion)))/sum(confusion))

[1] 0.1483871

summary(model.logistic)


Call:
glm(formula = V7 ~ V1 + V2 + V3 + V5 + V6, family = "binomial", 
    data = vertebral.two)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.19329  -0.37733  -0.02971   0.40452   2.69386  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -14.74316    3.20375  -4.602 4.19e-06 ***
V1            0.08323    0.02416   3.445 0.000571 ***
V2           -0.16219    0.03586  -4.523 6.10e-06 ***
V3            0.02729    0.01953   1.397 0.162357    
V5            0.10493    0.02271   4.619 3.85e-06 ***
V6           -0.17023    0.02336  -7.288 3.15e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 389.86  on 309  degrees of freedom
Residual deviance: 178.98  on 304  degrees of freedom
AIC: 190.98

Number of Fisher Scoring iterations: 7

Logistic Regression

library(knitr)
kable(summary(model.logistic)$coef, digits = 4)

	Estimate	Std. Error	z value	Pr(>\|z\|)
(Intercept)	-14.7432	3.2037	-4.6018	0.0000
V1	0.0832	0.0242	3.4452	0.0006
V2	-0.1622	0.0359	-4.5231	0.0000
V3	0.0273	0.0195	1.3972	0.1624
V5	0.1049	0.0227	4.6193	0.0000
V6	-0.1702	0.0234	-7.2879	0.0000

\[ \begin{equation}\large \ln(\frac{p}{1-p}) = 14.7432 + 0.0832 V1 - 0.1622 V2 + 0.0273 V3 \\+ 0.1049 V5 - 0.1702 V6 \end{equation} \]

Thank you