'facet' 태그의 글 목록

'facet'에 해당되는 글 1건

2017.07.12 Lesson 3: Explore One Variable

Lesson 3: Explore One Variable

Python, R 분석과 프로그래밍/Data Analysis with R 2017. 7. 12. 10:23

#데이터 불러오기

getwd()
[1] "C:/Users/amore/Documents/FOR ME/Data Scientist/Udacity/Data Analysis with R/eda-course-materials/lesson3"
list.files()
[1] "lesson3.Rmd"         "lesson3_student.rmd" "pseudo_facebook.tsv"
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
 [1] "userid"                "age"                   "dob_day"              
 [4] "dob_year"              "dob_month"             "gender"               
 [7] "tenure"                "friend_count"          "friendships_initiated"
[10] "likes"                 "likes_received"        "mobile_likes"         
[13] "mobile_likes_received" "www_likes"             "www_likes_received"

#Histogram of User's Birthdays

# Load ggplot2, then draw a histogram of the day of the month users were born.
# The library() call and the plot call must be separate statements.
library(ggplot2)
qplot(x = dob_day, data = pf)

Here are some things that I noticed. On the first day of the month I see this huge bin of almost 8,000 people. This seems really unusual, since I would expect birthdays to be spread roughly evenly across every day of the month.

# Same histogram, with an x-axis tick for every day of the month (1 to 31).
# The library() call and the plot call must be separate statements.
library(ggplot2)
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31)

#Faceting

# Birth-day histogram split into one panel per birth month, laid out in
# four columns. The library() call and the plot call must be separate
# statements.
library(ggplot2)
qplot(x = dob_day, data = pf) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 4)

Now, you may have noticed some peaks in May or perhaps in October, but I think what's really interesting is this huge spike on January first. There's almost 4,000 users in this bin. Now, this could be because of the default settings that Facebook uses or perhaps users are choosing the first choice in the drop down menus. Another idea is that some users may want to protect their privacy and so they just go with January first by default. Whatever the case may be, I think it's important that we make our considerations in the context of our data. We want to look out for these types of anomalies.

#Friend Count

# Histogram of friend_count using geom_histogram()'s default binning.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()

#Limiting the Axes

# Restrict the x-axis to 0-1000 through qplot()'s xlim argument.
qplot(friend_count, data = pf, xlim = c(0, 1000))


## alternative solution: the same limits supplied as a scale layer
qplot(friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))

# Adjust the bin width: use bins 25 friends wide and put an x-axis tick
# every 50 friends between 0 and 1000.
qplot(x = friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

## alternative solution: the same plot written with ggplot() syntax
ggplot(aes(x = friend_count), data = pf) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

##splits up the data by gender

# One histogram panel per gender value; gender defines the facet rows.
qplot(friend_count, data = pf) +
  facet_grid(gender ~ .) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))




# Omitting NA observations
# Keep only the rows whose gender is not NA, then plot 10-wide bins per gender.
qplot(
  friend_count,
  data = pf[!is.na(pf$gender), ],
  binwidth = 10
) +
  facet_grid(gender ~ .) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

#equivalent ggplot syntax:

# ggplot() version: friend_count histogram wrapped into one panel per gender.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  facet_wrap(~gender) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))




# Same ggplot() histogram, restricted to rows whose gender is not NA.
ggplot(data = pf[!is.na(pf$gender), ], mapping = aes(x = friend_count)) +
  geom_histogram() +
  facet_wrap(~gender) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

##statistics by gender

> table(pf$gender)
female   male 
 40254  58574 
> by(pf$friend_count, pf$gender, summary)
pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      37      96     242     244    4923 
--------------------------------------------------------------------- 
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      27      74     165     182    4917

### Tenure

# Tenure histogram with a bin width of 30, black outlines and a blue fill.
# I() passes the colours through as literal values instead of mapped data.
qplot(tenure, data = pf,
      binwidth = 30,
      fill = I('#099DD9'), color = I('black'))



# Equivalent ggplot syntax: colours are set directly on the geom, so no I()
# wrapper is needed.
ggplot(data = pf, mapping = aes(x = tenure)) +
  geom_histogram(fill = '#099DD9', color = 'black', binwidth = 30)

## How would you create a histogram of tenure by year?

#create a histogram of tenure measured in years rather than in days





# It looks like the bulk of our users had less than two and a half years on
# Facebook.
# Tenure is divided by 365 to convert days to years and binned by quarter
# year. breaks uses seq(1, 7, 1) so every year from 1 to 7 gets a tick;
# c(1, 7, 1) would only request ticks at 1 and 7.
qplot(x = tenure / 365, data = pf, binwidth = 0.25,
      color = I('black'), fill = I('#099DD9')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
#Equivalent ggplot syntax:
# ggplot() histogram of tenure in years (tenure / 365), quarter-year bins.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(fill = '#F79420', color = 'black', binwidth = 0.25)

## Labeling Plots

# Tenure-in-years histogram with descriptive axis labels.
# breaks uses seq(1, 7, 1) so every year from 1 to 7 gets a tick;
# c(1, 7, 1) would only request ticks at 1 and 7.
qplot(x = tenure / 365, data = pf, binwidth = 0.25,
      xlab = 'Number of years using Facebook',
      ylab = 'Number of users in sample',
      color = I('black'), fill = I('#099DD9')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))


#Equivalent ggplot syntax:

# ggplot() version of the labelled tenure-in-years histogram; both axis
# titles are set in a single labs() call.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(fill = '#F79420', color = 'black') +
  labs(x = 'Number of years using Facebook',
       y = 'Number of users in sample') +
  scale_x_continuous(limits = c(0, 7), breaks = seq(1, 7, 1))

## User Ages

> range(pf$age)
[1]  13 113
# Histogram of user ages in 5-year bins, limited to the observed age range
# (13 to 113). breaks uses seq(0, 113, 5) to place a tick every 5 years;
# c(0, 113, 5) would only request ticks at 0, 113 and 5.
qplot(x = age, data = pf, binwidth = 5,
      xlab = 'User ages',
      ylab = 'Number of users in sample',
      color = I('black'), fill = I('blue')) +
  scale_x_continuous(breaks = seq(0, 113, 5), limits = c(13, 113))


#Equivalent ggplot syntax:

# ggplot() age histogram with one-year bins and an x-axis tick every 5 years.
ggplot(data = pf, mapping = aes(x = age)) +
  geom_histogram(fill = '#5760AB', binwidth = 1) +
  scale_x_continuous(breaks = seq(0, 113, 5))

## Transforming Data

> summary(pf$friend_count)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    31.0    82.0   196.4   206.0  4923.0 
> summary(log10(pf$friend_count))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   -Inf   1.491   1.914    -Inf   2.314   3.692 
> summary(log10(pf$friend_count + 1))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.505   1.919   1.868   2.316   3.692 
> summary(sqrt(pf$friend_count))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   5.568   9.055  11.090  14.350  70.160 
#Learn about how to use scales and how to create multiple plots on one page. You'll also need to install and load the package gridExtra.
#Create one column with the following three histograms: friend count, and friend count transformed using log10 and square root.
> library(gridExtra)
> p1 <- qplot(x = friend_count, data = pf)
> p2 <- qplot(x = log10(friend_count+1), data = pf)
> p3 <- qplot(x = sqrt(friend_count), data = pf)
> grid.arrange(p1,p2,p3, ncol = 1)
#Equivalent ggplot syntax: we get the same outputs as we had before
# Build the base histogram once, then derive the log10- and sqrt-scaled
# versions by adding a scale layer to it.
pp1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
pp2 <- pp1 + scale_x_log10()
pp3 <- pp1 + scale_x_sqrt()

# Stack the three plots in a single column.
grid.arrange(pp1, pp2, pp3, ncol = 1)

## Add a Scaling Layer

> logScale <- qplot(x = log10(friend_count), data = pf)
> countScale <- ggplot(aes(x = friend_count), data = pf) +
  geom_histogram() +
  scale_x_log10()
> grid.arrange(logScale, countScale, ncol=2)

-> Comparing the two plots, we can see that the difference is really in the x-axis labeling. 
   Using scale_x_log10 will label the axis in actual friend counts, whereas using the 
   log10 wrapper will label the x-axis in log units. In general it is easier to 
   think about actual counts, so that's why people prefer using scale_x_log10 as 
   a layer.

# friend_count histogram drawn on a log10-scaled x-axis.
qplot(friend_count, data = pf) + scale_x_log10()

## Frequency Polygons : who has more friends on average men or women?

# This allows us to see the shape and the peaks of our distribution in more
# detail.
# Histogram of friend_count (bin width 10) for rows whose gender is not NA,
# one panel per gender. The limits argument is spelled out in full instead of
# relying on partial matching of `lim`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)),
      binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)


# Frequency polygon of friend_count, one coloured line per gender, so both
# distributions share a single panel. The limits argument is spelled out in
# full instead of relying on partial matching of `lim`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)),
      binwidth = 10, geom = 'freqpoly', color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))



# But again, this plot doesn't really answer our question: who has more friends
# on average, men or women? Let's change the y-axis to show proportions instead
# of raw counts.


# Frequency polygon with the y-axis rescaled from raw counts to proportions:
# each bin's count is divided by the sum of the counts.
# Fixes the y-axis label typo and spells out `limits` instead of relying on
# partial matching of `lim`.
qplot(x = friend_count, y = ..count.. / sum(..count..),
      data = subset(pf, !is.na(gender)),
      xlab = 'Friend Count',
      ylab = 'Proportion of Users with that friend count',
      binwidth = 10, geom = 'freqpoly', color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

## Likes on the Web

# Frequency polygon of www_likes per gender on a log10-scaled x-axis.
# Only one x scale is added: a plot keeps a single scale per aesthetic, so a
# bare scale_x_continuous() placed before scale_x_log10() would just be
# replaced (with a message) and has no effect.
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      geom = 'freqpoly', color = gender) +
  scale_x_log10()



# What's the www_likes count for males? The first question is asking how many
# www_likes there are in the entire data set for males.

> by(pf$www_likes, pf$gender, sum)

pf$gender: female
[1] 3507665
--------------------------------------------------------------------- 
pf$gender: male

## Box Plots

# Box plot of friend_count for each gender, skipping rows with NA gender.
qplot(gender, friend_count,
      geom = 'boxplot',
      data = pf[!is.na(pf$gender), ])

## Adjust the code to focus on users who have friend counts between 0 and 1000.

# Box plot limited to friend counts between 0 and 1000 via qplot()'s ylim.
qplot(gender, friend_count,
      geom = 'boxplot',
      ylim = c(0, 1000),
      data = pf[!is.na(pf$gender), ])




# Same approach, with the 0-1000 limits supplied as a y scale layer.
qplot(gender, friend_count,
      geom = 'boxplot',
      data = pf[!is.na(pf$gender), ]) +
  scale_y_continuous(limits = c(0, 1000))




-> Notice how the top of the box is just below 250, so it might be around 230. But this value might not be accurate for all of our data. When we use the ylim parameter or the scale_y_continuous layer, we actually remove data points from the calculations. 

# So a better way to do this is to use the coord_cartesian layer to set the
# y limits instead: it zooms the view without dropping data points from the
# box plot calculations.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  coord_cartesian(ylim = c(0, 1000))



-> Here we set the y limits from 0 to 1000; notice how the top of the box has moved slightly closer to 250 for females.

## Box Plots, Quartiles, and Friendships

# Box plot of friend_count by gender, zoomed to the 0-250 range with
# coord_cartesian().
qplot(gender, friend_count,
      geom = 'boxplot',
      data = pf[!is.na(pf$gender), ]) +
  coord_cartesian(ylim = c(0, 250))


> by(pf$friend_count, pf$gender, summary)
pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      37      96     242     244    4923 
---------------------------------------------------------------------------- 
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      27      74     165     182    4917 
-> The third quartile, or the 75% mark, is at 244, and that's all the way up 
here (그래프를 보며). This means that 75% of female users have a friend count below 244.
Or, another way to say this is that 25% of female users have more than 244 friends.

## On average, who initiated more friendships in our sample: men or women?

> names(pf)
 [1] "userid"                "age"                   "dob_day"              
 [4] "dob_year"              "dob_month"             "gender"               
 [7] "tenure"                "friend_count"          "friendships_initiated"
[10] "likes"                 "likes_received"        "mobile_likes"         
[13] "mobile_likes_received" "www_likes"             "www_likes_received"   
# Box plot of friendships_initiated by gender, zoomed to the 0-150 range with
# coord_cartesian().
qplot(gender, friendships_initiated,
      geom = 'boxplot',
      data = pf[!is.na(pf$gender), ]) +
  coord_cartesian(ylim = c(0, 150))

> by(pf$friendships_initiated, pf$gender, summary)
pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    19.0    49.0   113.9   124.8  3654.0 
---------------------------------------------------------------------------- 
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0    15.0    44.0   103.1   111.0  4144.0 

-> On average, who initiated more friendships in our sample: men or women?
Women.

## Getting Logical : What percent of check in using mobile?

> head(pf$mobile_likes)
[1] 0 0 0 0 0 0
> summary(pf$mobile_likes)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    0.0     0.0     4.0   106.1    46.0 25110.0 
> summary(pf$mobile_likes > 0)
   Mode   FALSE    TRUE    NA's 
logical   35056   63947       0 
> # Initialize the new column on pf itself (not a stray global variable),
> # flag users with at least one mobile like as 1 and everyone else as 0,
> # then store the flag as a factor.
> pf$mobile_check_in <- NA
> pf$mobile_check_in <- ifelse(pf$mobile_likes > 0, 1, 0)
> pf$mobile_check_in <- factor(pf$mobile_check_in)
> summary(pf$mobile_check_in)
    0     1 
35056 63947 
> sum(pf$mobile_check_in == 1) / length(pf$mobile_check_in) 
[1] 0.6459097
#what percent of check in using mobile? 65%

#reference

1. https://cn.udacity.com/course/data-analysis-with-r--ud651

2. http://www.cookbook-r.com/Graphs/Facets_(ggplot2)

3. https://en.wikipedia.org/wiki/Web_colors

4. http://ggplot2.tidyverse.org/reference/theme.html

5. http://lightonphiri.org/blog/ggplot2-multiple-plots-in-one-graph-using-gridextra

6. http://ggplot2.tidyverse.org/reference/scale_continuous.html

7. https://en.wikipedia.org/wiki/Linear_regression#Assumptions

8. https://en.wikipedia.org/wiki/Normal_distribution

9. https://www.r-statistics.com/2013/05/log-transformations-for-skewed-and-wide-distributions-from-practical-data-science-with-r/

'Python, R 분석과 프로그래밍 > Data Analysis with R' 카테고리의 다른 글

Lesson2: R Basic (0)	2017.07.11

Posted by 마르띤

이전 1 다음

데이터마이너를 꿈꾸며

'facet'에 해당되는 글 1건

Lesson 3: Explore One Variable

'Python, R 분석과 프로그래밍 > Data Analysis with R' 카테고리의 다른 글

링크

카테고리

최근에 올라온 글

최근에 받은 트랙백

글 보관함

티스토리툴바