# 원하는 데이터만 발라서 보기
> data(mtcars)
> mean(mtcars$mpg)
[1] 20.09062
> subset(mtcars, mtcars$mpg >= 30 | mtcars$hp < 60)
mpg cyl disp hp drat wt qsec vs am gear carb
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
> mtcars[mtcars$mpg >= 30 | mtcars$hp < 60, ] #column 전체를 보기 위해서는 콤마 필수
mpg cyl disp hp drat wt qsec vs am gear carb
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
#reddit data 보기, 범주형 변수 그래프 그리기
> reddit <- read.csv('reddit.csv')
> library(ggplot2)
> qplot(data = reddit, x = age.range)
#오더 순서 정하기 setting levels of ordered factors solution
> reddit$age.range = ordered(reddit$age.range,levels=c("Under 18","18-24", "25-34","35-44","45-54","55-64","65 or Above"))
> qplot(data = reddit, x = age.range)
#alternative solution
> reddit$age.range = factor(reddit$age.range, levels=c("Under 18","18-24", "25-34","35-44","45-54","55-64","65 or Above"),ordered=T)
> qplot(data = reddit, x = age.range)
# practice
> nlevels(reddit$income.range)
[1] 8
> levels(reddit$income.range)
[1] "$100,000 - $149,999" "$150,000 or more" "$20,000 - $29,999" "$30,000 - $39,999"
[5] "$40,000 - $49,999" "$50,000 - $69,999" "$70,000 - $99,999" "Under $20,000"
> qplot(data = reddit, x = income.range)
#아래같은 방법도 가능
> reddit$income.range = ordered(reddit$income.range, levels=c("Under $20,000" , "$20,000 - $29,999" , "$30,000 - $39,999", "$40,000 - $49,999","$50,000 - $69,999" , "$70,000 - $99,999", "$100,000 - $149,999" , "$150,000 or more"))
> qplot(data = reddit, x = income.range)
#다른 예제
> tShirts <- factor(c('medium', 'small', 'large', 'medium', 'large', 'large'), levels = c('medium','small','large'))
> tShirts
[1] medium small large medium large large
Levels: medium small large
> qplot(x = tShirts)
> tShirts <- ordered(tShirts, levels = c('small', 'medium', 'large'))
> tShirts
[1] medium small large medium large large
Levels: small < medium < large
> qplot(x = tShirts)
참고
https://cn.udacity.com/course/data-analysis-with-r--ud651
https://cn.udacity.com/course/data-wrangling-with-mongodb--ud032
http://vita.had.co.nz/papers/tidy-data.pdf
http://courses.had.co.nz.s3-website-us-east-1.amazonaws.com/12-rice-bdsi/slides/07-tidy-data.pdf
http://www.computerworld.com/article/2497143/business-intelligence/business-intelligence-beginner-s-guide-to-r-introduction.html
http://www.statmethods.net/index.html
https://www.r-bloggers.com/
http://www.cookbook-r.com/
http://blog.revolutionanalytics.com/2013/08/foodborne-chicago.html
http://blog.yhat.com/posts/roc-curves.html
https://github.com/corynissen/foodborne_classifier
'Python, R 분석과 프로그래밍 > Data Analysis with R' 카테고리의 다른 글
Lesson 3: Explore One Variable (0) | 2017.07.12 |
---|