线性回归模型(Linear Model for Regression)
线性分类模型(Linear Model for Classification)
library(MASS)
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
head(birthwt)
## low age lwt race smoke ptl ht ui ftv bwt
## 85 0 19 182 2 0 0 0 1 0 2523
## 86 0 33 155 3 0 0 0 0 3 2551
## 87 0 20 105 1 1 0 0 0 1 2557
## 88 0 21 108 1 1 0 0 1 2 2594
## 89 0 18 107 1 1 0 0 1 0 2600
## 91 0 21 124 3 0 0 0 0 0 2622
colnames(birthwt) <-
c("birthwt.below.2500",
"mother.age", "mother.weight",
"race", "mother.smokes", "previous.prem.labor",
"hypertension", "uterine.irr", "physician.visits",
"birthwt.grams")
birthwt$race <- as.factor(birthwt$race)
(attr(birthwt$race, "levels") <- c("White", "Black", "Other"))
## [1] "White" "Black" "Other"
birthwt$mother.smokes <- as.factor(birthwt$mother.smokes)
(attr(birthwt$mother.smokes, "levels") <- c("Nonsmoker", "Smoker"))
## [1] "Nonsmoker" "Smoker"
lm(birthwt.grams ~ mother.age, data = birthwt)
##
## Call:
## lm(formula = birthwt.grams ~ mother.age, data = birthwt)
##
## Coefficients:
## (Intercept) mother.age
## 2655.74 12.43
library(ggplot2)
# birthwt.grams ~ mother.age
p <- ggplot(
birthwt,
aes(x = mother.age, y = birthwt.grams))
plot.simple <- p +
geom_point(aes(colour = race),
size = 4) +
stat_smooth(method = "lm") +
theme(text = element_text(size = 22))
plot.simple
slr <- lm(birthwt.grams ~ mother.age, data = birthwt)
attributes(slr)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
##
## $class
## [1] "lm"
summary(slr)
##
## Call:
## lm(formula = birthwt.grams ~ mother.age, data = birthwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2294.78 -517.63 10.51 530.80 1774.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2655.74 238.86 11.12 <2e-16 ***
## mother.age 12.43 10.02 1.24 0.216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 728.2 on 187 degrees of freedom
## Multiple R-squared: 0.008157, Adjusted R-squared: 0.002853
## F-statistic: 1.538 on 1 and 187 DF, p-value: 0.2165
Multiple Linear Regression
mlr <- lm(birthwt.grams ~ mother.age + race, data = birthwt)
mlr
##
## Call:
## lm(formula = birthwt.grams ~ mother.age + race, data = birthwt)
##
## Coefficients:
## (Intercept) mother.age raceBlack raceOther
## 2949.979 6.288 -365.715 -285.466
coef(mlr)
## (Intercept) mother.age raceBlack raceOther
## 2949.979032 6.287741 -365.715000 -285.465780
confint(mlr)
## 2.5 % 97.5 %
## (Intercept) 2446.20259 3453.75547
## mother.age -13.58435 26.15984
## raceBlack -682.62810 -48.80190
## raceOther -513.39407 -57.53749
mlr.inter <- lm(birthwt.grams ~ mother.age * race, data = birthwt)
mlr.inter
##
## Call:
## lm(formula = birthwt.grams ~ mother.age * race, data = birthwt)
##
## Coefficients:
## (Intercept) mother.age raceBlack
## 2583.54 21.37 1022.79
## raceOther mother.age:raceBlack mother.age:raceOther
## 326.05 -62.54 -26.03
birthwt.grams ~ mother.age + race
summary(mlr)
##
## Call:
## lm(formula = birthwt.grams ~ mother.age + race, data = birthwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2131.57 -488.02 -1.16 521.87 1757.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2949.979 255.352 11.553 <2e-16 ***
## mother.age 6.288 10.073 0.624 0.5332
## raceBlack -365.715 160.636 -2.277 0.0240 *
## raceOther -285.466 115.531 -2.471 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 715.7 on 185 degrees of freedom
## Multiple R-squared: 0.05217, Adjusted R-squared: 0.0368
## F-statistic: 3.394 on 3 and 185 DF, p-value: 0.01909
summary(mlr.inter)
##
## Call:
## lm(formula = birthwt.grams ~ mother.age * race, data = birthwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2182.35 -474.23 13.48 523.86 1496.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2583.54 321.52 8.035 1.11e-13 ***
## mother.age 21.37 12.89 1.658 0.0991 .
## raceBlack 1022.79 694.21 1.473 0.1424
## raceOther 326.05 545.30 0.598 0.5506
## mother.age:raceBlack -62.54 30.67 -2.039 0.0429 *
## mother.age:raceOther -26.03 23.20 -1.122 0.2633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 710.7 on 183 degrees of freedom
## Multiple R-squared: 0.07541, Adjusted R-squared: 0.05015
## F-statistic: 2.985 on 5 and 183 DF, p-value: 0.01291
命令 | 描述 |
---|---|
coef |
给出模型系数 |
confint |
置信区间 |
anova |
方差分析 |
deviance |
残差平方和 |
effect |
正交影响向量 |
fitted |
拟合结果 |
resid |
给出模型残差 |
vcov |
主要参数的协方差矩阵 |
lm.full <- lm(birthwt.grams ~ ., data = birthwt)
bwd <- step(lm.full, direction = "backward")
## Start: AIC=2304.73
## birthwt.grams ~ birthwt.below.2500 + mother.age + mother.weight +
## race + mother.smokes + previous.prem.labor + hypertension +
## uterine.irr + physician.visits
##
## Df Sum of Sq RSS AIC
## - physician.visits 1 9552 33263661 2302.8
## - previous.prem.labor 1 263368 33517477 2304.2
## - mother.age 1 270851 33524960 2304.3
## - mother.weight 1 308633 33562742 2304.5
## - hypertension 1 332675 33586785 2304.6
## <none> 33254109 2304.7
## - mother.smokes 1 1088898 34343008 2308.8
## - race 2 1706722 34960832 2310.2
## - uterine.irr 1 2509469 35763578 2316.5
## - birthwt.below.2500 1 42448208 75702317 2458.2
##
## Step: AIC=2302.79
## birthwt.grams ~ birthwt.below.2500 + mother.age + mother.weight +
## race + mother.smokes + previous.prem.labor + hypertension +
## uterine.irr
##
## Df Sum of Sq RSS AIC
## - previous.prem.labor 1 267027 33530689 2302.3
## - mother.age 1 299213 33562875 2302.5
## - mother.weight 1 300805 33564466 2302.5
## - hypertension 1 324794 33588455 2302.6
## <none> 33263661 2302.8
## - mother.smokes 1 1084502 34348163 2306.8
## - race 2 1699490 34963151 2308.2
## - uterine.irr 1 2502617 35766278 2314.5
## - birthwt.below.2500 1 42477363 75741025 2456.3
##
## Step: AIC=2302.3
## birthwt.grams ~ birthwt.below.2500 + mother.age + mother.weight +
## race + mother.smokes + hypertension + uterine.irr
##
## Df Sum of Sq RSS AIC
## - mother.age 1 231737 33762425 2301.6
## - mother.weight 1 251768 33782456 2301.7
## - hypertension 1 319904 33850593 2302.1
## <none> 33530689 2302.3
## - mother.smokes 1 939840 34470529 2305.5
## - race 2 1653607 35184295 2307.4
## - uterine.irr 1 2282970 35813659 2312.8
## - birthwt.below.2500 1 42301896 75832585 2454.5
##
## Step: AIC=2301.6
## birthwt.grams ~ birthwt.below.2500 + mother.weight + race + mother.smokes +
## hypertension + uterine.irr
##
## Df Sum of Sq RSS AIC
## - mother.weight 1 180447 33942872 2300.6
## - hypertension 1 297763 34060188 2301.3
## <none> 33762425 2301.6
## - mother.smokes 1 873670 34636095 2304.4
## - race 2 1467913 35230338 2305.6
## - uterine.irr 1 2228338 35990763 2311.7
## - birthwt.below.2500 1 42175080 75937505 2452.8
##
## Step: AIC=2300.61
## birthwt.grams ~ birthwt.below.2500 + race + mother.smokes + hypertension +
## uterine.irr
##
## Df Sum of Sq RSS AIC
## - hypertension 1 205506 34148378 2299.8
## <none> 33942872 2300.6
## - mother.smokes 1 939594 34882466 2303.8
## - race 2 1511671 35454543 2304.8
## - uterine.irr 1 2344367 36287239 2311.2
## - birthwt.below.2500 1 44668862 78611734 2457.3
##
## Step: AIC=2299.75
## birthwt.grams ~ birthwt.below.2500 + race + mother.smokes + uterine.irr
##
## Df Sum of Sq RSS AIC
## <none> 34148378 2299.8
## - mother.smokes 1 934437 35082815 2302.8
## - race 2 1543569 35691947 2304.1
## - uterine.irr 1 2201541 36349919 2309.6
## - birthwt.below.2500 1 46921303 81069681 2461.2
lm.null <- lm(birthwt.grams ~ 1, data = birthwt)
fwd <- step(lm.null, direction = "forward", scope = ( ~ birthwt.below.2500 + mother.age + mother.weight ))
## Start: AIC=2492.76
## birthwt.grams ~ 1
##
## Df Sum of Sq RSS AIC
## + birthwt.below.2500 1 61573224 38396432 2313.9
## + mother.weight 1 3448639 96521017 2488.1
## <none> 99969656 2492.8
## + mother.age 1 815483 99154173 2493.2
##
## Step: AIC=2313.91
## birthwt.grams ~ birthwt.below.2500
##
## Df Sum of Sq RSS AIC
## <none> 38396432 2313.9
## + mother.weight 1 284886 38111546 2314.5
## + mother.age 1 929 38395503 2315.9
summary(bwd)
##
## Call:
## lm(formula = birthwt.grams ~ birthwt.below.2500 + race + mother.smokes +
## uterine.irr, data = birthwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1039.37 -338.51 30.49 298.19 1489.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3500.81 58.71 59.624 < 2e-16 ***
## birthwt.below.2500 -1131.17 71.33 -15.857 < 2e-16 ***
## raceBlack -206.36 97.38 -2.119 0.035434 *
## raceOther -189.89 74.74 -2.541 0.011898 *
## mother.smokesSmoker -157.30 70.29 -2.238 0.026442 *
## uterine.irr -309.28 90.04 -3.435 0.000734 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 432 on 183 degrees of freedom
## Multiple R-squared: 0.6584, Adjusted R-squared: 0.6491
## F-statistic: 70.55 on 5 and 183 DF, p-value: < 2.2e-16
summary(fwd)
##
## Call:
## lm(formula = birthwt.grams ~ birthwt.below.2500 + uterine.irr +
## race + mother.smokes, data = birthwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1039.37 -338.51 30.49 298.19 1489.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3500.81 58.71 59.624 < 2e-16 ***
## birthwt.below.2500 -1131.17 71.33 -15.857 < 2e-16 ***
## uterine.irr -309.28 90.04 -3.435 0.000734 ***
## raceBlack -206.36 97.38 -2.119 0.035434 *
## raceOther -189.89 74.74 -2.541 0.011898 *
## mother.smokesSmoker -157.30 70.29 -2.238 0.026442 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 432 on 183 degrees of freedom
## Multiple R-squared: 0.6584, Adjusted R-squared: 0.6491
## F-statistic: 70.55 on 5 and 183 DF, p-value: < 2.2e-16
wise <- step(lm.full, trace = 0); summary(wise)
##
## Call:
## lm(formula = birthwt.grams ~ birthwt.below.2500 + race + mother.smokes +
## uterine.irr, data = birthwt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1039.37 -338.51 30.49 298.19 1489.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3500.81 58.71 59.624 < 2e-16 ***
## birthwt.below.2500 -1131.17 71.33 -15.857 < 2e-16 ***
## raceBlack -206.36 97.38 -2.119 0.035434 *
## raceOther -189.89 74.74 -2.541 0.011898 *
## mother.smokesSmoker -157.30 70.29 -2.238 0.026442 *
## uterine.irr -309.28 90.04 -3.435 0.000734 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 432 on 183 degrees of freedom
## Multiple R-squared: 0.6584, Adjusted R-squared: 0.6491
## F-statistic: 70.55 on 5 and 183 DF, p-value: < 2.2e-16
library(lars)
data("diabetes") # 糖尿病患者的血液化验指标数据集
# y: 因变量; x: 标准化的自变量矩阵;
# x2: 自变量矩阵, 由x及一些交互作用组成
names(diabetes)
## [1] "x" "y" "x2"
list(length(diabetes$y), dim(diabetes$x), dim(diabetes$x2))
## [[1]]
## [1] 442
##
## [[2]]
## [1] 442 10
##
## [[3]]
## [1] 442 64
y <- diabetes$y
x <- diabetes$x
x2 <- diabetes$x2
kappa(x2)
## [1] 11427.09
library(car)
y.x2 <- as.data.frame(cbind(y, x2))
v <- vif(lm(y ~ ., y.x2))
sort(v, decreasing = T)[1:5]
## tc ldl hdl ltg `tc:ltg`
## 1295001.21 1000312.11 180836.83 139965.06 61177.87
library(MASS)
a <- lm.ridge(y ~ ., lambda = seq(0, 150, length = 151),
data = y.x2, model = TRUE)
names(a)
## [1] "coef" "scales" "Inter" "lambda" "ym" "xm" "GCV" "kHKB"
## [9] "kLW"
a$lambda[which.min(a$GCV)]
## [1] 81
a$coef[which.min(a$GCV)]
## [1] 17.12666
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20)
abline(v = a$lambda[which.min(a$GCV)])
#par(cex = 2)
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20, ylim = c(-30, 30))
abline(v = a$lambda[which.min(a$GCV)])
plot(a$lambda, a$GCV, xlab = expression(lambda), ylab = expression(beta), type = "l")
abline(v = a$lambda[which.min(a$GCV)])
matplot(a$lambda, t(a$coef), xlab = expression(lambda), ylab = "Coefficients", type = "l", lty = 1:20, ylim = c(-30, 30))
abline(v = a$lambda[which.min(a$GCV)])
laa <- lars(x2, y)
plot(laa)
dim(summary(laa))
## [1] 105 3
dim(summary(laa))
## [1] 105 3
min(laa$Cp)
## [1] 18.19822
# Cp在第15步时取得最小值
summary(laa)[13:20, ]
## LARS/LASSO
## Call: lars(x = x2, y = y)
## Df Rss Cp
## 12 13 1249726 25.058
## 13 14 1234993 21.858
## 14 15 1225552 20.526
## 15 16 1213289 18.198
## 16 17 1212253 19.833
## 17 18 1210149 21.090
## 18 19 1206003 21.627
## 19 20 1204605 23.134
cva <- cv.lars(x2, y, K = 10)
(best <- cva$index[which.min(cva$cv)])
## [1] 0.03030303
coef <- coef.lars(laa, mode = "fraction", s = best)
coef[coef!=0]
## sex bmi map hdl ltg glu
## -92.966618 502.356055 241.631884 -174.935230 465.398774 10.828814
## bmi^2 glu^2 age:sex age:map age:ltg age:glu
## 33.918974 62.355697 97.307919 28.427380 4.497064 13.779861
## bmi:map
## 79.712186
coef1 <- coef.lars(laa, mode = "step", s = 15)
coef1[coef1!=0]
## sex bmi map hdl ltg glu
## -112.312541 501.616811 251.792232 -187.827872 467.778725 18.130314
## age^2 bmi^2 glu^2 age:sex age:map age:ltg
## 7.608734 38.748381 69.809696 107.727556 30.045318 8.613394
## age:glu bmi:map
## 11.600284 85.688057
c(length(coef[coef!=0]), length(coef1[coef1!=0]))
## [1] 13 14
回归诊断
残差正态性检验
shapiro.test(wise$residuals)
##
## Shapiro-Wilk normality test
##
## data: wise$residuals
## W = 0.98877, p-value = 0.142
library(lmtest)
dwtest(wise)
##
## Durbin-Watson test
##
## data: wise
## DW = 0.417, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
library(car)
outlierTest(wise)
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 226 3.590985 0.00042366 0.080071
(pred <- predict(wise, newdata = birthwt[c(3, 6, 7), ]))
## 87 91 92
## 3343.513 3310.928 3500.814
(real <- birthwt[c(3, 6, 7), "birthwt.grams"])
## [1] 2557 2622 2637
(mean(pred - real))
## [1] 779.7516
predict(wise, newdata = birthwt[c(3, 6, 7), ], interval = "prediction")
## fit lwr upr
## 87 3343.513 2482.964 4204.061
## 91 3310.928 2450.411 4171.445
## 92 3500.814 2640.684 4360.945
y ~ u * v
y ~ (u + v)^2
y ~ u + v + u:v
y ~ u + v + w + u:v + u:w
y ~ u + v + w + u(v + w)
y ~ u * (v + w)
y ~ u * v * w
y ~ poly(x, 3, raw = TRUE)
y ~ x + I(x^2) + I(x^3)
y ~ x + 0
Data
vertebral.two <- read.table("column_2C.dat")
head(vertebral.two, 2)
## V1 V2 V3 V4 V5 V6 V7
## 1 63.03 22.55 39.61 40.48 98.67 -0.25 AB
## 2 39.06 10.06 25.02 29.00 114.41 4.56 AB
c(mean(vertebral.two[, "V6"]), vertebral.two[116, "V6"])
## [1] 26.29674 418.54000
runin <- lm(V6 ~ ., vertebral.two[-116, ])
vertebral.two[116, -6]
## V1 V2 V3 V4 V5 V7
## 116 129.83 8.4 48.38 121.43 107.69 AB
vertebral.two[116, 6] <- predict(runin, vertebral.two[116, -6])
c(mean(vertebral.two[, "V6"]), vertebral.two[116, "V6"])
## [1] 25.11047 50.79539
Linear Discriminant Analysis
library(MASS)
model.lda <- lda(V7 ~ ., data = vertebral.two)
result.pred <- predict(model.lda, vertebral.two)$class
(confusion <- table(vertebral.two[, 7], result.pred))
## result.pred
## AB NO
## AB 188 22
## NO 23 77
(error.rate <- (sum(confusion)-sum(diag(confusion)))/sum(confusion))
## [1] 0.1451613
Logistic Regression
model.logistic <- glm(V7 ~ ., data = vertebral.two, family = "binomial")
model.logistic <- step(model.logistic, trace = 0)
is.NO <- (predict(model.logistic, vertebral.two, type = "response") > 0.5)
result.pred <- rep("NO", nrow(vertebral.two))
result.pred[!is.NO] <- "AB"
(confusion <- table(vertebral.two[, 7], result.pred))
## result.pred
## AB NO
## AB 186 24
## NO 22 78
(error.rate <- (sum(confusion)-sum(diag(confusion)))/sum(confusion))
## [1] 0.1483871
summary(model.logistic)
##
## Call:
## glm(formula = V7 ~ V1 + V2 + V3 + V5 + V6, family = "binomial",
## data = vertebral.two)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.19329 -0.37733 -0.02971 0.40452 2.69386
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.74316 3.20375 -4.602 4.19e-06 ***
## V1 0.08323 0.02416 3.445 0.000571 ***
## V2 -0.16219 0.03586 -4.523 6.10e-06 ***
## V3 0.02729 0.01953 1.397 0.162357
## V5 0.10493 0.02271 4.619 3.85e-06 ***
## V6 -0.17023 0.02336 -7.288 3.15e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 389.86 on 309 degrees of freedom
## Residual deviance: 178.98 on 304 degrees of freedom
## AIC: 190.98
##
## Number of Fisher Scoring iterations: 7
#library(knitr)
kable(summary(model.logistic)$coef, digits = 4)
Estimate | Std. Error | z value | Pr(>|z|) | |
---|---|---|---|---|
(Intercept) | -14.7432 | 3.2037 | -4.6018 | 0.0000 |
V1 | 0.0832 | 0.0242 | 3.4452 | 0.0006 |
V2 | -0.1622 | 0.0359 | -4.5231 | 0.0000 |
V3 | 0.0273 | 0.0195 | 1.3972 | 0.1624 |
V5 | 0.1049 | 0.0227 | 4.6193 | 0.0000 |
V6 | -0.1702 | 0.0234 | -7.2879 | 0.0000 |