제1장 다변량 데이터 시각화

KNOU/3 R을 이용한 다변량분석 2017. 2. 25. 16:47

회귀모형의 적합결과 및 분산분석

> setwd("~/Documents/Rwork/KNOU/2017.2학기/R을 이용한 다변량 분석")

> hald.data = read.table('hald.txt',header=T)

> hald.data

y x1 x2 x3 x4

1 78.5 7 26 6 60

2 74.3 1 29 15 52

3 104.3 11 56 8 20

4 87.6 11 31 8 47

5 95.9 7 52 6 33

6 109.2 11 55 9 22

7 102.7 3 71 17 6

8 72.5 1 31 22 44

9 93.1 2 54 18 22

10 115.9 21 47 4 26

11 83.8 1 40 23 34

12 113.3 11 66 9 12

13 109.4 10 68 8 12

> hald.ls = lm(y~.,data=hald.data)

> hald.ls

Call:

lm(formula = y ~ ., data = hald.data)

Coefficients:

(Intercept) x1 x2 x3 x4

62.4054 1.5511 0.5102 0.1019 -0.1441

> summary(hald.ls)

Call:

lm(formula = y ~ ., data = hald.data)

Residuals:

Min 1Q Median 3Q Max

-3.1750 -1.6709 0.2508 1.3783 3.9254

Coefficients:

Estimate Std. Error t value Pr(>|t|)

(Intercept) 62.4054 70.0710 0.891 0.3991

x1 1.5511 0.7448 2.083 0.0708 .

x2 0.5102 0.7238 0.705 0.5009

x3 0.1019 0.7547 0.135 0.8959

x4 -0.1441 0.7091 -0.203 0.8441

---

Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.446 on 8 degrees of freedom

Multiple R-squared: 0.9824, Adjusted R-squared: 0.9736

F-statistic: 111.5 on 4 and 8 DF, p-value: 4.756e-07

> anova(hald.ls)

Analysis of Variance Table

Response: y

Df Sum Sq Mean Sq F value Pr(>F)

x1 1 1450.08 1450.08 242.3679 2.888e-07 ***

x2 1 1207.78 1207.78 201.8705 5.863e-07 ***

x3 1 9.79 9.79 1.6370 0.2366

x4 1 0.25 0.25 0.0413 0.8441

Residuals 8 47.86 5.98

---

데이터 생성

> a = c(1:10)

> a

[1] 1 2 3 4 5 6 7 8 9 10

> a = c(20:10)

> a

[1] 20 19 18 17 16 15 14 13 12 11 10

> a %/% 3

[1] 6 6 6 5 5 5 4 4 4 3 3

> a %% 3

[1] 2 1 0 2 1 0 2 1 0 2 1

> a = c(1:3) == c(3:1)

> a

[1] FALSE TRUE FALSE

SEQ 함수

> x = -5:5

> x

[1] -5 -4 -3 -2 -1 0 1 2 3 4 5

> x[x<0]

[1] -5 -4 -3 -2 -1

> mean(x)

[1] 0

> x[x<0]=0

> mean(x)

[1] 1.363636

> consec=(1:10)*10

> consec

[1] 10 20 30 40 50 60 70 80 90 100

> consec[2]

[1] 20

> consec[c(3,6,8,10)]

[1] 30 60 80 100

> consec[c(-2,-4)]

[1] 10 30 50 60 70 80 90 100

> x=c(10:1)

> consec<60

[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE

> x[consec<60] #첫번째, 두번째, 세번째 값

[1] 10 9 8 7 6

> seq(-pi,pi,1) #파이값을 1씩 증가

[1] -3.1415927 -2.1415927 -1.1415927 -0.1415927 0.8584073 1.8584073 2.8584073

> seq(-pi,pi,length=10) #파이값을 10개로 나눔

[1] -3.1415927 -2.4434610 -1.7453293 -1.0471976 -0.3490659 0.3490659 1.0471976 1.7453293

[9] 2.4434610 3.1415927

난수생성

> rnorm(10,-5,2.5) #평균이 -5, 표준편차가 2.5인 정규분포를 따르는 난수 10개

[1] -5.66696754 -2.73338407 -3.51628322 -5.24746144 -7.56506891 -4.47006154 -6.19141094 -9.51129455

[9] -0.08720922 -8.05256413

> rnorm(20) #평균이 0, 표준편차가 1인 정규분포를 따르는 난수 20개

[1] -0.43086982 -0.40359489 1.93710961 -0.84071208 -0.10802247 -0.01138187 0.92168668 -2.05161187

[9] -0.12887195 -0.27202527 1.73687933 -1.09407623 0.46089730 1.74135668 0.69786046 -0.38488478

[17] 1.05465527 -1.22405759 0.18905127 1.26340571

행렬연산

> x=c(1:12)

> x=matrix(x,ncol=4,byrow=T)

> x

[,1] [,2] [,3] [,4]

[1,] 1 2 3 4

[2,] 5 6 7 8

[3,] 9 10 11 12

> y=matrix(x,ncol=4,byrow=F)

> y

[,1] [,2] [,3] [,4]

[1,] 1 2 3 4

[2,] 5 6 7 8

[3,] 9 10 11 12

> y=c(1:12)

> y=matrix(y,ncol=4,byrow=F)

> y

[,1] [,2] [,3] [,4]

[1,] 1 4 7 10

[2,] 2 5 8 11

[3,] 3 6 9 12

> x

[,1] [,2] [,3] [,4]

[1,] 1 2 3 4

[2,] 5 6 7 8

[3,] 9 10 11 12

> x[,c(1:3)]

[,1] [,2] [,3]

[1,] 1 2 3

[2,] 5 6 7

[3,] 9 10 11

> matrix(1,nrow=4,byrow=T)

[,1]

[1,] 1

[2,] 1

[3,] 1

[4,] 1

> matrix(1,nrow=4,ncol=4)

[,1] [,2] [,3] [,4]

[1,] 1 1 1 1

[2,] 1 1 1 1

[3,] 1 1 1 1

[4,] 1 1 1 1

> x[c(1:3),]

[,1] [,2] [,3] [,4]

[1,] 1 2 3 4

[2,] 5 6 7 8

[3,] 9 10 11 12

> x[c(1:3),-2]

[,1] [,2] [,3]

[1,] 1 3 4

[2,] 5 7 8

[3,] 9 11 12

> a=c(1:3,5:7)

> a

[1] 1 2 3 5 6 7

> x=matrix(a,ncol=3,byrow=T)

> x

[,1] [,2] [,3]

[1,] 1 2 3

[2,] 5 6 7

> y=matrix(c(1:4),ncol=2,byrow=F)

> y

[,1] [,2]

[1,] 1 3

[2,] 2 4

> t(x)%*%y

[,1] [,2]

[1,] 11 23

[2,] 14 30

[3,] 17 37

apply함수

> apply(x,1,mean) #1은 행

[1] 2 6

> apply(x,2,mean) #2는 열

[1] 3 4 5

Function 문

> squareF=function(x){x*x}

> View(squreF)

> squareF(5)

[1] 25

기술통계량 구하기

> survey.data=read.table('survey.txt',header=T)

> survey.data

seq sex marriage age job edu salary

1 1 1 1 21 1 4 60

2 2 1 1 22 5 5 100

3 3 1 1 33 1 4 200

4 4 2 2 33 7 4 200

5 5 1 2 28 1 4 70

6 6 1 1 21 5 5 80

7 7 2 2 39 7 4 190

8 8 1 1 32 1 4 100

9 9 1 2 44 3 1 120

10 10 1 2 55 4 4 110

11 11 2 2 46 7 5 150

12 12 1 1 20 1 4 50

13 13 1 2 31 6 4 210

14 14 1 1 27 1 4 60

15 15 2 1 21 5 5 80

> str(survey.data)

'data.frame': 15 obs. of 7 variables:

$ seq : int 1 2 3 4 5 6 7 8 9 10 ...

$ sex : int 1 1 1 2 1 1 2 1 1 1 ...

$ marriage: int 1 1 1 2 2 1 2 1 2 2 ...

$ age : int 21 22 33 33 28 21 39 32 44 55 ...

$ job : int 1 5 1 7 1 5 7 1 3 4 ...

$ edu : int 4 5 4 4 4 5 4 4 1 4 ...

$ salary : int 60 100 200 200 70 80 190 100 120 110 ...

> survey.data$sex = as.factor(survey.data$sex)

> survey.data$marriage = as.factor(survey.data$marriage)

> survey.data$job = as.factor(survey.data$job)

> survey.data$edu = as.factor(survey.data$edu)

> str(survey.data)

'data.frame': 15 obs. of 7 variables:

$ seq : int 1 2 3 4 5 6 7 8 9 10 ...

$ sex : Factor w/ 2 levels "1","2": 1 1 1 2 1 1 2 1 1 1 ...

$ marriage: Factor w/ 2 levels "1","2": 1 1 1 2 2 1 2 1 2 2 ...

$ age : int 21 22 33 33 28 21 39 32 44 55 ...

$ job : Factor w/ 6 levels "1","3","4","5",..: 1 4 1 6 1 4 6 1 2 3 ...

$ edu : Factor w/ 3 levels "1","4","5": 2 3 2 2 2 3 2 2 1 2 ...

$ salary : int 60 100 200 200 70 80 190 100 120 110 ...

> attach(survey.data)

> mean(age)

[1] 31.53333

> sd(age)

[1] 10.57535

> summary(age)

Min. 1st Qu. Median Mean 3rd Qu. Max.

20.00 21.50 31.00 31.53 36.00 55.00

> tapply(age,sex,mean)

1 2

30.36364 34.75000

> tapply(age,sex,sd)

1 2

10.82841 10.59481

> tapply(age,marriage,mean)

1 2

24.62500 39.42857

> tapply(age,marriage,sd)

1 2

5.316752 9.571784

> table(sex,marriage)

marriage

sex 1 2

1 7 4

2 1 3

> sex.marriage=list(sex,marriage)

> table(sex.marriage)

sex.marriage.2

sex.marriage.1 1 2

1 7 4

2 1 3

> sex.edu=list(survey.data$sex,survey.data$edu)

> sex.edu.tb=table(sex.edu)

> sex.edu.tb

sex.edu.2

sex.edu.1 1 2 3 4 5

1 1 1 1 13 11

2 0 0 2 6 5

> rownames(sex.edu.tb)=c('남성','여성')

> colnames(sex.edu.tb)=c('무학','초졸','중졸','고졸','대졸')

> barplot(sex.edu.tb)

> par(mfrow=c(1,2))

> pie(sex.edu.tb[1,])

> title('Education of Male')

> pie(sex.edu.tb[2,])

> title('Education of Female')

#산점도 사례

> math <-c(68,64,48,46,78,60,90,50,66,70)

> physics<-c(70,68,46,48,84,64,92,52,68,72)

> cor(math,physics) #두 변수의 상관계수 값

[1] 0.9921232

> plot(math,physics,main='수학과 물리 산점도')

##설문조사 사례##

문항 1. 귀하의 성별은?

1) 남자 2) 여자 문항

2. 결혼하셨습니까?

1) 미혼 2) 기혼 3) 이혼 문항

3. 귀하의 나이는? (단위: 세)

4. 귀하의 직업은?

1) 회사원 2) 공무원 3) 노무자 4) 정치가 5) 학생 6) 기업가 7) 주부 8) 기타 문항

5. 귀하의 학력은?

1) 중졸이하 2)고졸 3) 대졸 4) 대학원 5) 그 이상

6. 가족의 월수입은? (단위: 만원)

> survey.data = read.table('survey.csv',sep=',',header=T)

> head(survey.data)

seq sex marriage age job edu salary

1 1 1 1 21 1 4 60

2 2 1 1 22 5 5 100

3 3 1 1 33 1 4 200

4 4 2 2 33 7 4 120

5 5 1 2 28 1 4 70

6 6 1 1 21 5 5 80

> attach(survey.data)

> mean(age)

[1] 34.275

> sd(age)

[1] 11.60236

> summary(age)

Min. 1st Qu. Median Mean 3rd Qu. Max.

20.00 24.75 32.00 34.28 42.50 59.00

#그룹화

> tapply(survey.data$age,survey.data$sex,mean) #성별 나의 평균

1 2

33.96296 34.92308

> tapply(survey.data$age,survey.data$sex,sd)

1 2

11.96945 11.24323

> tapply(survey.data$age,survey.data$marriage,mean) #결혼 별 나이 평균

1 2 3

24.66667 39.13043 50.50000

> tapply(survey.data$age,survey.data$marriage,sd)

1 2 3

4.151879 10.467718 12.020815

> sex.marriage=list(survey.data$sex,survey.data$marriage) #그룹

> table(sex.marriage) #분할표

sex.marriage.2

sex.marriage.1 1 2 3

1 10 15 2

2 5 8 0

> tapply(survey.data$age,sex.marriage,mean)

1 2 3

1 24.8 37.86667 50.5

2 24.4 41.50000 NA

> tapply(survey.data$age,sex.marriage,sd)

1 2 3

1 4.709329 11.230486 12.02082

2 3.209361 9.071147 NA

> table(survey.data$sex) #빈도표

1 2

27 13

> table(edu)

edu

1 2 3 4 5

1 1 3 19 16

> SexEdu=table(survey.data$sex,survey.data$edu)

> SexEdu

1 2 3 4 5

1 1 1 1 13 11

2 0 0 2 6 5

어느 집단에서 표본을 10명 추출하여 다음과 같은 4개 문항에 대하여 설문조사를 실시하였다.

문항 1. 귀하의 성별은? 1) 남자 2) 여자 문항

2. 귀하의 나이는? (단위 : 세) 문항

3. 교육정도는? 1) 중졸이하 2) 고졸 3)대졸 및 그 이상 문항

4. 월수입(단위: 만원)

> ex8=read.table('ex8.txt',header=T)

> colnames(ex8)<-c('sex','age','edu','salary')

> sex.tb=table(ex8$sex)

> sex.tb

1 2

6 4

> edu.tb=table(ex8$edu)

> edu.tbe

1 2 3

2 4 4

> rownames(edu.tb)=c('중졸이하','고졸이하','대졸이상')

> edu.tb

중졸이하 고졸이하 대졸이상

2 4 4

> barplot(edu.tb)

> plot(ex8$age,ex8$salary)

> title('Scatter plot of (Age,Salary)')

> plot(ex8$age,ex8$salary,type="n")

> points(ex8$age[sex=='1'],ex8$salary[sex=='1'],pch='M',col='blue')

> points(ex8$age[sex=='2'],ex8$salary[sex=='2'],pch='F',col='red')

> title("나이, 월수입 산점도:남녀별 구분")

위의 그래프가 내가 그린 것이고, 아래는 교재. 왜 안되지?.. 좀더 공부해보자.

> edt.tb = table(survey.data$edu)

> edt.tb

1 2 3 4 5

1 1 3 19 16

> rownames(edt.tb) = c('무학','초졸','중졸','고졸','대졸')

> barplot(edu.tb)

> title('교육정도 막대그림')

> pie(edu.tb)

> title('교육정도 원그림')

Sex를 그룹변수로 하여 교육(edu)의 막대그림을 그려보도록 하자.

> sex_edu = list(survey.data$sex, survey.data$edu)

> sex_edu.tb = table(sex_edu)

> sex_edu.tb

sex_edu.2

sex_edu.1 1 2 3 4 5

1 1 1 1 13 11

2 0 0 2 6 5

> colnames(sex_edu.tb) = c('무학','초졸','중졸','고졸','대졸')

> rownames(sex_edu.tb) = c('남자','여자')

> barplot(sex_edu.tb)

> title('성별 교육 정도 막대그림')

> boxplot(salary~sex,data=survey.data,xlab='성별',ylab='소득')

R에 내장된 데이터 co2를 이용, 선으로 그려보자(lines(smooth(co2)). col에서 4는 blue를 의미.

> plot(co2)

> lines(smooth(co2),col=4)

> x<-seq(0,20,0.1)

> y<-exp(-x/10)*cos(2*x)

> plot(x,y,type='line') #or type=’l’

> library(MVA)

> library(HSAUR2)

> data(USairpollution)

> head(USairpollution)

SO2 temp manu popul wind precip predays

Albany 46 47.6 44 116 8.8 33.36 135

Albuquerque 11 56.8 46 244 8.9 7.77 58

Atlanta 24 61.5 368 497 9.1 48.34 115

Baltimore 47 55.0 625 905 9.6 41.31 111

Buffalo 11 47.1 391 463 12.4 36.11 166

Charleston 31 55.2 35 71 6.5 40.75 148

> plot(USairpollution$manu,USairpollution$popul)

> x=USairpollution[,c(3,4)]

> bvbox(x,xlab='manu',ylab='popul')

> title('bivariate boxplot') # bivariate 이변량

> identify(x)

> rownames(x)[c(7,9,14,30)]

[1] "Chicago" "Cleveland" "Detroit" "Philadelphia"

> plot(wind~temp,data=USairpollution,pch=9)

> with(USairpollution,symbols(temp,wind,circles=SO2,inches=0.5,add=T))

> title('Bubble Plot')

등고선 그림(contour)과 이미지 그림(image plot)

기상자료, 지리 위치 등을 나타내는 자료는 좌표상에 z변수를 등고선이나, 이미지 등으로 표시하게 되면 매우 효율적임

- 등고선 자료:

자료를 나타내기 위하 두 좌표의 격자(grad)위에 등고선으로 면을 그려줌

- 이미지 그림:

색깔을 이용하여 z변수의 이미지를 표현한 그래프

> x<--6:16

> x

[1] -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16

> par(mfrow=c(2,2))

> contour(outer(x,x),method='edge',vfont=c('sans serif','plain'))

> z<-outer(x,sqrt(abs(x)),FUN='/')

> image(x,x,z)

> contour(x,x,z,col='pink',add=TRUE,method='edge',vfont=c('sans serif','plain'))

> contour(x,x,z,ylim=c(1,6),method='simple',laboex=1)

Warning messages:

1: In plot.window(xlim, ylim, ...) : "laboex" is not a graphical parameter

2: In title(...) : "laboex" is not a graphical parameter

3: In axis(side = side, at = at, labels = labels, ...) :

"laboex" is not a graphical parameter

4: In axis(side = side, at = at, labels = labels, ...) :

"laboex" is not a graphical parameter

5: In box(...) : "laboex" is not a graphical parameter

> contour(x,x,z,ylim=c(-6,6),nlev=20,lty=2,method='simple')

겨냥도 그림(Perspective Plot)

등간격의 격자상에 높이 값을 갖는 행렬 자료에 대한 3차원 표현 방법 높이는 선으로 연결

> par(mfrow=c(1,2))

> x<-seq(-10,10,length=30)

> y<-x

> f<-function(x,y){r<-sqrt(x^2+y^2);10*sin(r)/r}

> z<-outer(x,y,f)

> z[is.na(z)]<-l

Error: object 'l' not found

> z[is.na(z)]<-1

> persp(x,y,z,theta=30,phi=30,expand=0.5,col='lightblue')

> persp(x,y,z,theta=30,phi=30,expand=0.5,col='lightblue',ltheta=120,shade=0.75,thicktype='detailed',xlab='X',ylab='Y',zlab='Sinc(r)')

Warning message:

In persp.default(x, y, z, theta = 30, phi = 30, expand = 0.5, col = "lightblue", :

"thicktype" is not a graphical parameter

> edu<-read.csv('education.csv')

> edu

grade y1970 y1990

1 초등졸 23.8 21.1

2 중졸 24.7 20.2

3 고졸 36.0 43.6

4 대졸 11.0 15.1

> name=edu[,1]

> grade70=edu[,2]

> grade90=edu[,3]

> par(mfrow=c(1,2))

> pie(grade70,label=name,clockwise=T,col=c(2:5),main='1970년도 여성학력')

> mtext(outer=F,'(a)',side=1,line=-6)

> pie(grade90,label=name,clockwise=T,col=c(2:5),main='1990년도 여성학력')

> mtext(outer=F,'(b)',side=1,line=-6)

연습문제

1. 13개 시중은행에 대한 편리성, 신속성, 친절, 능률, 쾌적, 자동화등의 점수. 출처: 유종렬 외 3인, S-Plus를 이용한 통계계산, 박영사, 1997

> bank<-read.csv('bank.csv',header=T)

> head(bank)

X convenience accuracy kindness efficiency pleasant automatic

1 kookmin 70.5 59.4 63.7 54.3 66.9 62.6

2 enterpr 64.8 70.3 68.6 55.2 68.0 64.1

3 boram 67.1 79.6 78.5 62.4 79.8 62.4

4 commerce 61.1 65.0 65.6 54.4 64.5 63.9

5 seoul 63.4 66.5 67.9 65.0 59.7 62.0

6 shinhan 72.3 69.1 74.2 60.0 70.1 68.2

> bank<-bank[,-1]

> name<-c('kookmin','enterpr','boram','commerce','seoul','shinhan','city','exchange','first','chohung','hana','hanil','house')

> head(bank)

convenience accuracy kindness efficiency pleasant automatic

1 70.5 59.4 63.7 54.3 66.9 62.6

2 64.8 70.3 68.6 55.2 68.0 64.1

3 67.1 79.6 78.5 62.4 79.8 62.4

4 61.1 65.0 65.6 54.4 64.5 63.9

5 63.4 66.5 67.9 65.0 59.7 62.0

6 72.3 69.1 74.2 60.0 70.1 68.2

(1) 각 변수들의 히스토그램을 그리고 설명

> library(dplyr)

> nrow(bank)

[1] 13

> bank$convenience %>% summary

Min. 1st Qu. Median Mean 3rd Qu. Max.

61.10 63.50 64.80 65.98 68.40 72.30

> bank$convenience %>% mean

[1] 65.97692

> bank$convenience %>% sd

[1] 3.279521

> bank$convenience %>% hist

> con.h<-bank %>% filter(convenience > mean(bank$convenience))

> con.l<-bank %>% filter(convenience < mean(bank$convenience))

> con.h

convenience accuracy kindness efficiency pleasant automatic

1 70.5 59.4 63.7 54.3 66.9 62.6

2 67.1 79.6 78.5 62.4 79.8 62.4

3 72.3 69.1 74.2 60.0 70.1 68.2

4 68.4 67.5 67.3 51.3 71.3 65.8

5 66.1 66.5 67.3 50.7 63.4 63.3

6 69.0 74.3 80.5 63.6 75.7 55.9

> con.h$convenience %>% summary

Min. 1st Qu. Median Mean 3rd Qu. Max.

66.10 67.42 68.70 68.90 70.12 72.30

> con.l$convenience %>% summary

Min. 1st Qu. Median Mean 3rd Qu. Max.

61.10 63.30 63.50 63.47 64.15 64.80

> con.h %>% summarise('평균 친절도' = mean(kindness))

평균 친절도

1 71.91667

> con.l %>% summarise('평균 친절도'=mean(kindness))

평균 친절도

1 67.7

2) 산점도 행렬

> plot(bank)

3) 별그림과 얼굴그림

> rownames(bank) = name

> head(bank)

convenience accuracy kindness efficiency pleasant automatic

kookmin 70.5 59.4 63.7 54.3 66.9 62.6

enterpr 64.8 70.3 68.6 55.2 68.0 64.1

boram 67.1 79.6 78.5 62.4 79.8 62.4

commerce 61.1 65.0 65.6 54.4 64.5 63.9

seoul 63.4 66.5 67.9 65.0 59.7 62.0

shinhan 72.3 69.1 74.2 60.0 70.1 68.2

> stars(bank)

> library(aplpack)

Loading required package: tcltk

> faces(bank,face.type=1)

effect of variables:

modified item Var

"height of face " "convenience"

"width of face " "accuracy"

"structure of face" "kindness"

"height of mouth " "efficiency"

"width of mouth " "pleasant"

"smiling " "automatic"

"height of eyes " "convenience"

"width of eyes " "accuracy"

"height of hair " "kindness"

"width of hair " "efficiency"

"style of hair " "pleasant"

"height of nose " "automatic"

"width of nose " "convenience"

"width of ear " "accuracy"

"height of ear " "kindness"

출처: R을 이용한 데이터 마이닝(김성수, 김현중, 정성석, 이용구 공저)

Posted by 마르띤

데이터마이너를 꿈꾸며

제1장 다변량 데이터 시각화

링크

카테고리

최근에 올라온 글

최근에 받은 트랙백

글 보관함

티스토리툴바