회귀모형의 적합결과 및 분산분석
> setwd("~/Documents/Rwork/KNOU/2017.2학기/R을 이용한 다변량 분석")
> hald.data = read.table('hald.txt',header=T)
> hald.data
y x1 x2 x3 x4
1 78.5 7 26 6 60
2 74.3 1 29 15 52
3 104.3 11 56 8 20
4 87.6 11 31 8 47
5 95.9 7 52 6 33
6 109.2 11 55 9 22
7 102.7 3 71 17 6
8 72.5 1 31 22 44
9 93.1 2 54 18 22
10 115.9 21 47 4 26
11 83.8 1 40 23 34
12 113.3 11 66 9 12
13 109.4 10 68 8 12
> hald.ls = lm(y~.,data=hald.data)
> hald.ls
Call:
lm(formula = y ~ ., data = hald.data)
Coefficients:
(Intercept) x1 x2 x3 x4
62.4054 1.5511 0.5102 0.1019 -0.1441
> summary(hald.ls)
Call:
lm(formula = y ~ ., data = hald.data)
Residuals:
Min 1Q Median 3Q Max
-3.1750 -1.6709 0.2508 1.3783 3.9254
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 62.4054 70.0710 0.891 0.3991
x1 1.5511 0.7448 2.083 0.0708 .
x2 0.5102 0.7238 0.705 0.5009
x3 0.1019 0.7547 0.135 0.8959
x4 -0.1441 0.7091 -0.203 0.8441
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.446 on 8 degrees of freedom
Multiple R-squared: 0.9824, Adjusted R-squared: 0.9736
F-statistic: 111.5 on 4 and 8 DF, p-value: 4.756e-07
> anova(hald.ls)
Analysis of Variance Table
Response: y
Df Sum Sq Mean Sq F value Pr(>F)
x1 1 1450.08 1450.08 242.3679 2.888e-07 ***
x2 1 1207.78 1207.78 201.8705 5.863e-07 ***
x3 1 9.79 9.79 1.6370 0.2366
x4 1 0.25 0.25 0.0413 0.8441
Residuals 8 47.86 5.98
---
데이터 생성
> a = c(1:10)
> a
[1] 1 2 3 4 5 6 7 8 9 10
> a = c(20:10)
> a
[1] 20 19 18 17 16 15 14 13 12 11 10
> a %/% 3
[1] 6 6 6 5 5 5 4 4 4 3 3
> a %% 3
[1] 2 1 0 2 1 0 2 1 0 2 1
> a = c(1:3) == c(3:1)
> a
[1] FALSE TRUE FALSE
SEQ 함수
> x = -5:5
> x
[1] -5 -4 -3 -2 -1 0 1 2 3 4 5
> x[x<0]
[1] -5 -4 -3 -2 -1
> mean(x)
[1] 0
> x[x<0]=0
> mean(x)
[1] 1.363636
> consec=(1:10)*10
> consec
[1] 10 20 30 40 50 60 70 80 90 100
> consec[2]
[1] 20
> consec[c(3,6,8,10)]
[1] 30 60 80 100
> consec[c(-2,-4)]
[1] 10 30 50 60 70 80 90 100
> x=c(10:1)
> consec<60
[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
> x[consec<60] #첫번째, 두번째, 세번째 값
[1] 10 9 8 7 6
> seq(-pi,pi,1) #파이값을 1씩 증가
[1] -3.1415927 -2.1415927 -1.1415927 -0.1415927 0.8584073 1.8584073 2.8584073
> seq(-pi,pi,length=10) #파이값을 10개로 나눔
[1] -3.1415927 -2.4434610 -1.7453293 -1.0471976 -0.3490659 0.3490659 1.0471976 1.7453293
[9] 2.4434610 3.1415927
난수생성
> rnorm(10,-5,2.5) #평균이 -5, 표준편차가 2.5인 정규분포를 따르는 난수 10개
[1] -5.66696754 -2.73338407 -3.51628322 -5.24746144 -7.56506891 -4.47006154 -6.19141094 -9.51129455
[9] -0.08720922 -8.05256413
> rnorm(20) #평균이 0, 표준편차가 1인 정규분포를 따르는 난수 20개
[1] -0.43086982 -0.40359489 1.93710961 -0.84071208 -0.10802247 -0.01138187 0.92168668 -2.05161187
[9] -0.12887195 -0.27202527 1.73687933 -1.09407623 0.46089730 1.74135668 0.69786046 -0.38488478
[17] 1.05465527 -1.22405759 0.18905127 1.26340571
행렬연산
> x=c(1:12)
> x=matrix(x,ncol=4,byrow=T)
> x
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
> y=matrix(x,ncol=4,byrow=F)
> y
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
> y=c(1:12)
> y=matrix(y,ncol=4,byrow=F)
> y
[,1] [,2] [,3] [,4]
[1,] 1 4 7 10
[2,] 2 5 8 11
[3,] 3 6 9 12
> x
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
> x[,c(1:3)]
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 5 6 7
[3,] 9 10 11
> matrix(1,nrow=4,byrow=T)
[,1]
[1,] 1
[2,] 1
[3,] 1
[4,] 1
> matrix(1,nrow=4,ncol=4)
[,1] [,2] [,3] [,4]
[1,] 1 1 1 1
[2,] 1 1 1 1
[3,] 1 1 1 1
[4,] 1 1 1 1
> x[c(1:3),]
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
> x[c(1:3),-2]
[,1] [,2] [,3]
[1,] 1 3 4
[2,] 5 7 8
[3,] 9 11 12
> a=c(1:3,5:7)
> a
[1] 1 2 3 5 6 7
> x=matrix(a,ncol=3,byrow=T)
> x
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 5 6 7
> y=matrix(c(1:4),ncol=2,byrow=F)
> y
[,1] [,2]
[1,] 1 3
[2,] 2 4
> t(x)%*%y
[,1] [,2]
[1,] 11 23
[2,] 14 30
[3,] 17 37
apply함수
> apply(x,1,mean) #1은 행
[1] 2 6
> apply(x,2,mean) #2는 열
[1] 3 4 5
Function 문
> squareF=function(x){x*x}
> View(squreF)
> squareF(5)
[1] 25
기술통계량 구하기
> survey.data=read.table('survey.txt',header=T)
> survey.data
seq sex marriage age job edu salary
1 1 1 1 21 1 4 60
2 2 1 1 22 5 5 100
3 3 1 1 33 1 4 200
4 4 2 2 33 7 4 200
5 5 1 2 28 1 4 70
6 6 1 1 21 5 5 80
7 7 2 2 39 7 4 190
8 8 1 1 32 1 4 100
9 9 1 2 44 3 1 120
10 10 1 2 55 4 4 110
11 11 2 2 46 7 5 150
12 12 1 1 20 1 4 50
13 13 1 2 31 6 4 210
14 14 1 1 27 1 4 60
15 15 2 1 21 5 5 80
> str(survey.data)
'data.frame': 15 obs. of 7 variables:
$ seq : int 1 2 3 4 5 6 7 8 9 10 ...
$ sex : int 1 1 1 2 1 1 2 1 1 1 ...
$ marriage: int 1 1 1 2 2 1 2 1 2 2 ...
$ age : int 21 22 33 33 28 21 39 32 44 55 ...
$ job : int 1 5 1 7 1 5 7 1 3 4 ...
$ edu : int 4 5 4 4 4 5 4 4 1 4 ...
$ salary : int 60 100 200 200 70 80 190 100 120 110 ...
> survey.data$sex = as.factor(survey.data$sex)
> survey.data$marriage = as.factor(survey.data$marriage)
> survey.data$job = as.factor(survey.data$job)
> survey.data$edu = as.factor(survey.data$edu)
> str(survey.data)
'data.frame': 15 obs. of 7 variables:
$ seq : int 1 2 3 4 5 6 7 8 9 10 ...
$ sex : Factor w/ 2 levels "1","2": 1 1 1 2 1 1 2 1 1 1 ...
$ marriage: Factor w/ 2 levels "1","2": 1 1 1 2 2 1 2 1 2 2 ...
$ age : int 21 22 33 33 28 21 39 32 44 55 ...
$ job : Factor w/ 6 levels "1","3","4","5",..: 1 4 1 6 1 4 6 1 2 3 ...
$ edu : Factor w/ 3 levels "1","4","5": 2 3 2 2 2 3 2 2 1 2 ...
$ salary : int 60 100 200 200 70 80 190 100 120 110 ...
> attach(survey.data)
> mean(age)
[1] 31.53333
> sd(age)
[1] 10.57535
> summary(age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 21.50 31.00 31.53 36.00 55.00
> tapply(age,sex,mean)
1 2
30.36364 34.75000
> tapply(age,sex,sd)
1 2
10.82841 10.59481
> tapply(age,marriage,mean)
1 2
24.62500 39.42857
> tapply(age,marriage,sd)
1 2
5.316752 9.571784
> table(sex,marriage)
marriage
sex 1 2
1 7 4
2 1 3
> sex.marriage=list(sex,marriage)
> table(sex.marriage)
sex.marriage.2
sex.marriage.1 1 2
1 7 4
2 1 3
> sex.edu=list(survey.data$sex,survey.data$edu)
> sex.edu.tb=table(sex.edu)
> sex.edu.tb
sex.edu.2
sex.edu.1 1 2
3 4 5
1 1
1 1 13 11
2 0
0 2 6 5
> rownames(sex.edu.tb)=c('남성','여성')
> colnames(sex.edu.tb)=c('무학','초졸','중졸','고졸','대졸')
> barplot(sex.edu.tb)
> par(mfrow=c(1,2))
> pie(sex.edu.tb[1,])
> title('Education of Male')
> pie(sex.edu.tb[2,])
> title('Education of Female')
#산점도 사례
> math <-c(68,64,48,46,78,60,90,50,66,70)
> physics<-c(70,68,46,48,84,64,92,52,68,72)
> cor(math,physics) #두 변수의 상관계수 값
[1] 0.9921232
> plot(math,physics,main='수학과 물리 산점도')
##설문조사 사례##
문항 1. 귀하의 성별은?
1)
남자
2) 여자
문항
2.
결혼하셨습니까?
1)
미혼
2) 기혼
3) 이혼
문항
3.
귀하의
나이는? (단위: 세)
4.
귀하의
직업은?
1)
회사원 2) 공무원 3) 노무자
4) 정치가 5) 학생 6) 기업가
7) 주부
8) 기타
문항
5.
귀하의
학력은?
1)
중졸이하 2)고졸 3) 대졸 4)
대학원 5) 그 이상
6.
가족의
월수입은? (단위: 만원)
> survey.data =
read.table('survey.csv',sep=',',header=T)
> head(survey.data)
seq sex marriage
age job edu salary
1 1 1
1 21 1
4 60
2 2 1
1 22 5
5 100
3 3 1
1 33 1
4 200
4 4 2
2 33 7
4 120
5 5 1
2 28 1
4 70
6 6 1
1 21 5
5 80
> attach(survey.data)
> mean(age)
[1] 34.275
> sd(age)
[1] 11.60236
> summary(age)
Min. 1st
Qu. Median Mean 3rd Qu. Max.
20.00 24.75
32.00 34.28 42.50
59.00
#그룹화
> tapply(survey.data$age,survey.data$sex,mean) #성별 나의 평균
1 2
33.96296 34.92308
> tapply(survey.data$age,survey.data$sex,sd)
1 2
11.96945 11.24323
> tapply(survey.data$age,survey.data$marriage,mean) #결혼 별 나이 평균
1 2
3
24.66667 39.13043 50.50000
> tapply(survey.data$age,survey.data$marriage,sd)
1 2 3
4.151879 10.467718
12.020815
>
sex.marriage=list(survey.data$sex,survey.data$marriage) #그룹
> table(sex.marriage) #분할표
sex.marriage.2
sex.marriage.1
1 2 3
1 10
15 2
2 5
8 0
> tapply(survey.data$age,sex.marriage,mean)
1 2
3
1 24.8 37.86667 50.5
2 24.4 41.50000
NA
> tapply(survey.data$age,sex.marriage,sd)
1 2 3
1 4.709329 11.230486 12.02082
2 3.209361
9.071147 NA
> table(survey.data$sex) #빈도표
1 2
27 13
> table(edu)
edu
1 2
3 4 5
1 1 3 19
16
> SexEdu=table(survey.data$sex,survey.data$edu)
> SexEdu
1 2 3 4 5
1 1
1 1 13 11
2 0
0 2 6 5
어느
집단에서
표본을 10명 추출하여 다음과 같은 4개 문항에 대하여 설문조사를 실시하였다.
문항 1. 귀하의 성별은? 1) 남자 2)
여자
문항
2.
귀하의
나이는? (단위 : 세) 문항
3.
교육정도는? 1) 중졸이하 2) 고졸 3)대졸 및 그 이상 문항
4.
월수입(단위: 만원)
> ex8=read.table('ex8.txt',header=T)
> colnames(ex8)<-c('sex','age','edu','salary')
> sex.tb=table(ex8$sex)
> sex.tb
1 2
6 4
> edu.tb=table(ex8$edu)
> edu.tbe
1 2 3
2 4 4
> rownames(edu.tb)=c('중졸이하','고졸이하','대졸이상')
> edu.tb
중졸이하 고졸이하 대졸이상
2 4 4
> barplot(edu.tb)
> plot(ex8$age,ex8$salary)
> title('Scatter plot of (Age,Salary)')
> plot(ex8$age,ex8$salary,type="n")
>
points(ex8$age[sex=='1'],ex8$salary[sex=='1'],pch='M',col='blue')
>
points(ex8$age[sex=='2'],ex8$salary[sex=='2'],pch='F',col='red')
> title("나이, 월수입 산점도:남녀별 구분")
위의 그래프가 내가 그린 것이고, 아래는 교재. 왜 안되지?.. 좀더 공부해보자.
> edt.tb = table(survey.data$edu)
> edt.tb
1 2
3 4 5
1 1 3 19
16
> rownames(edt.tb) = c('무학','초졸','중졸','고졸','대졸')
> barplot(edu.tb)
> title('교육정도 막대그림')
> pie(edu.tb)
> title('교육정도 원그림')
Sex를 그룹변수로 하여 교육(edu)의 막대그림을 그려보도록 하자.
> sex_edu = list(survey.data$sex, survey.data$edu)
> sex_edu.tb = table(sex_edu)
> sex_edu.tb
sex_edu.2
sex_edu.1 1 2
3 4 5
1 1
1 1 13 11
2 0
0 2 6 5
> colnames(sex_edu.tb) = c('무학','초졸','중졸','고졸','대졸')
> rownames(sex_edu.tb) = c('남자','여자')
> barplot(sex_edu.tb)
> title('성별 교육 정도 막대그림')
> boxplot(salary~sex,data=survey.data,xlab='성별',ylab='소득')
R에 내장된 데이터 co2를
이용, 선으로 그려보자(lines(smooth(co2)). col에서 4는 blue를 의미.
> plot(co2)
> lines(smooth(co2),col=4)
> x<-seq(0,20,0.1)
> y<-exp(-x/10)*cos(2*x)
> plot(x,y,type='line') #or type=’l’
> library(MVA)
> library(HSAUR2)
> data(USairpollution)
> head(USairpollution)
SO2
temp manu popul wind precip predays
Albany 46
47.6 44 116
8.8 33.36 135
Albuquerque 11
56.8 46 244
8.9 7.77 58
Atlanta 24
61.5 368 497
9.1 48.34 115
Baltimore 47
55.0 625 905
9.6 41.31 111
Buffalo 11
47.1 391 463 12.4
36.11 166
Charleston 31
55.2 35 71
6.5 40.75 148
> plot(USairpollution$manu,USairpollution$popul)
> x=USairpollution[,c(3,4)]
> bvbox(x,xlab='manu',ylab='popul')
> title('bivariate boxplot') # bivariate 이변량
> identify(x)
>
rownames(x)[c(7,9,14,30)]
[1]
"Chicago"
"Cleveland"
"Detroit"
"Philadelphia"
> plot(wind~temp,data=USairpollution,pch=9)
>
with(USairpollution,symbols(temp,wind,circles=SO2,inches=0.5,add=T))
> title('Bubble Plot')
등고선 그림(contour)과 이미지 그림(image plot)
기상자료, 지리 위치 등을 나타내는 자료는 좌표상에 z변수를 등고선이나, 이미지 등으로 표시하게 되면 매우 효율적임
- 등고선 자료:
자료를 나타내기 위하 두 좌표의 격자(grad)위에 등고선으로 면을
그려줌
- 이미지 그림:
색깔을 이용하여 z변수의 이미지를 표현한 그래프
> x<--6:16
> x
[1] -6 -5 -4 -3 -2
-1 0
1 2 3
4 5 6
7 8 9 10 11 12 13 14 15 16
> par(mfrow=c(2,2))
> contour(outer(x,x),method='edge',vfont=c('sans
serif','plain'))
> z<-outer(x,sqrt(abs(x)),FUN='/')
> image(x,x,z)
>
contour(x,x,z,col='pink',add=TRUE,method='edge',vfont=c('sans serif','plain'))
> contour(x,x,z,ylim=c(1,6),method='simple',laboex=1)
Warning messages:
1: In plot.window(xlim, ylim, ...) : "laboex"
is not a graphical parameter
2: In title(...) : "laboex" is not a graphical
parameter
3: In axis(side = side, at = at, labels = labels, ...) :
"laboex" is not a graphical parameter
4: In axis(side = side, at = at, labels = labels, ...) :
"laboex" is not a graphical parameter
5: In box(...) : "laboex" is not a graphical
parameter
>
contour(x,x,z,ylim=c(-6,6),nlev=20,lty=2,method='simple')
겨냥도 그림(Perspective
Plot)
등간격의 격자상에 높이 값을 갖는 행렬 자료에 대한 3차원 표현 방법 높이는 선으로 연결
> par(mfrow=c(1,2))
> x<-seq(-10,10,length=30)
> y<-x
> f<-function(x,y){r<-sqrt(x^2+y^2);10*sin(r)/r}
> z<-outer(x,y,f)
> z[is.na(z)]<-l
Error: object 'l' not found
> z[is.na(z)]<-1
> persp(x,y,z,theta=30,phi=30,expand=0.5,col='lightblue')
>
persp(x,y,z,theta=30,phi=30,expand=0.5,col='lightblue',ltheta=120,shade=0.75,thicktype='detailed',xlab='X',ylab='Y',zlab='Sinc(r)')
Warning message:
In persp.default(x, y, z, theta = 30, phi = 30, expand =
0.5, col = "lightblue", :
"thicktype" is not a graphical parameter
> edu<-read.csv('education.csv')
> edu
grade y1970
y1990
1 초등졸 23.8
21.1
2 중졸 24.7 20.2
3 고졸 36.0 43.6
4 대졸 11.0 15.1
> name=edu[,1]
> grade70=edu[,2]
> grade90=edu[,3]
> par(mfrow=c(1,2))
> pie(grade70,label=name,clockwise=T,col=c(2:5),main='1970년도 여성학력')
> mtext(outer=F,'(a)',side=1,line=-6)
>
pie(grade90,label=name,clockwise=T,col=c(2:5),main='1990년도 여성학력')
> mtext(outer=F,'(b)',side=1,line=-6)
연습문제
1. 13개 시중은행에 대한 편리성,
신속성, 친절, 능률, 쾌적, 자동화등의 점수. 출처: 유종렬 외 3인, S-Plus를
이용한 통계계산, 박영사, 1997
> bank<-read.csv('bank.csv',header=T)
> head(bank)
X
convenience accuracy kindness efficiency pleasant automatic
1 kookmin 70.5
59.4 63.7 54.3
66.9 62.6
2 enterpr 64.8
70.3 68.6 55.2
68.0 64.1
3 boram 67.1
79.6 78.5 62.4
79.8 62.4
4 commerce
61.1 65.0 65.6
54.4 64.5 63.9
5 seoul 63.4
66.5 67.9 65.0
59.7 62.0
6 shinhan 72.3
69.1 74.2 60.0
70.1 68.2
> bank<-bank[,-1]
>
name<-c('kookmin','enterpr','boram','commerce','seoul','shinhan','city','exchange','first','chohung','hana','hanil','house')
> head(bank)
convenience
accuracy kindness efficiency pleasant automatic
1 70.5 59.4
63.7 54.3 66.9
62.6
2 64.8 70.3
68.6 55.2 68.0
64.1
3 67.1 79.6
78.5 62.4 79.8
62.4
4 61.1 65.0
65.6 54.4 64.5
63.9
5 63.4 66.5
67.9 65.0 59.7
62.0
6 72.3 69.1
74.2 60.0 70.1
68.2
(1) 각 변수들의 히스토그램을 그리고 설명
> library(dplyr)
> nrow(bank)
[1] 13
> bank$convenience %>% summary
Min. 1st
Qu. Median Mean 3rd Qu. Max.
61.10 63.50
64.80 65.98 68.40
72.30
> bank$convenience %>% mean
[1] 65.97692
> bank$convenience %>% sd
[1] 3.279521
>
bank$convenience %>% hist
> con.h<-bank %>% filter(convenience >
mean(bank$convenience))
> con.l<-bank %>% filter(convenience <
mean(bank$convenience))
> con.h
convenience
accuracy kindness efficiency pleasant automatic
1 70.5 59.4
63.7 54.3 66.9
62.6
2 67.1 79.6
78.5 62.4 79.8
62.4
3 72.3 69.1
74.2 60.0 70.1
68.2
4 68.4 67.5
67.3 51.3 71.3
65.8
5 66.1 66.5
67.3 50.7 63.4
63.3
6 69.0 74.3
80.5 63.6 75.7
55.9
> con.h$convenience %>% summary
Min. 1st
Qu. Median Mean 3rd Qu. Max.
66.10 67.42
68.70 68.90 70.12
72.30
> con.l$convenience %>% summary
Min. 1st
Qu. Median Mean 3rd Qu. Max.
61.10 63.30
63.50 63.47 64.15
64.80
> con.h %>% summarise('평균 친절도' = mean(kindness))
평균 친절도
1 71.91667
> con.l %>% summarise('평균 친절도'=mean(kindness))
평균 친절도
1 67.7
2) 산점도 행렬
> plot(bank)
3) 별그림과 얼굴그림
> rownames(bank) = name
> head(bank)
convenience accuracy kindness efficiency pleasant automatic
kookmin
70.5 59.4 63.7
54.3 66.9 62.6
enterpr
64.8 70.3 68.6
55.2 68.0 64.1
boram
67.1 79.6 78.5
62.4 79.8 62.4
commerce
61.1 65.0 65.6
54.4 64.5 63.9
seoul
63.4 66.5 67.9
65.0 59.7 62.0
shinhan
72.3 69.1 74.2
60.0 70.1 68.2
> stars(bank)
> library(aplpack)
Loading required package: tcltk
> faces(bank,face.type=1)
effect of variables:
modified item Var
"height of
face " "convenience"
"width of
face " "accuracy"
"structure of
face" "kindness"
"height of
mouth " "efficiency"
"width of
mouth " "pleasant"
"smiling " "automatic"
"height of
eyes " "convenience"
"width of
eyes " "accuracy"
"height of
hair " "kindness"
"width of
hair " "efficiency"
"style of
hair " "pleasant"
"height of
nose " "automatic"
"width of
nose " "convenience"
"width of
ear " "accuracy"
"height of
ear " "kindness"
출처: R을 이용한 데이터 마이닝(김성수, 김현중, 정성석, 이용구 공저)