library(tidyverse)
library(skimr)
library(ggplot2)
library(dplyr)
library(lubridate)
library(scales)

Cyclistic case study

Business task

Analyze bike usage of Cyclic customers to Identify differences between annual members and casual riders.

Data sources

# Data, read one 1 file at a time, if running this online in Posit load only first 
trips_2014_Q1Q2 <- read_csv("Cyclistic_Data/Divvy_Trips_2014_Q1Q2.csv")
## Rows: 905699 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): starttime, stoptime, from_station_name, to_station_name, usertype, ...
## dbl (6): trip_id, bikeid, tripduration, from_station_id, to_station_id, birt...
## 
## â„ą Use `spec()` to retrieve the full column specification for this data.
## â„ą Specify the column types or set `show_col_types = FALSE` to quiet this message.
# trips_2014_Q3_07 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q3-07.csv")
# trips_2014_Q3_0809 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q3-0809.csv")
# trips_2014_Q4 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q4.csv")

This analysis will be performed in the information collected for the trips that took place in the first half of the year 2014. The information collected has these characteristics:

glimpse(trips_2014_Q1Q2)
## Rows: 905,699
## Columns: 12
## $ trip_id           <dbl> 2355134, 2355133, 2355130, 2355129, 2355128, 2355127…
## $ starttime         <chr> "6/30/2014 23:57", "6/30/2014 23:56", "6/30/2014 23:…
## $ stoptime          <chr> "7/1/2014 0:07", "7/1/2014 0:00", "6/30/2014 23:35",…
## $ bikeid            <dbl> 2006, 2217, 2798, 173, 173, 2782, 1152, 941, 2316, 3…
## $ tripduration      <dbl> 604, 263, 126, 3481, 638, 336, 304, 525, 203, 167, 8…
## $ from_station_id   <dbl> 131, 282, 327, 134, 320, 332, 174, 199, 288, 238, 17…
## $ from_station_name <chr> "Lincoln Ave & Belmont Ave", "Halsted St & Maxwell S…
## $ to_station_id     <dbl> 303, 22, 225, 194, 134, 319, 44, 24, 94, 316, 55, 91…
## $ to_station_name   <chr> "Broadway & Cornelia Ave", "May St & Taylor St", "Ha…
## $ usertype          <chr> "Subscriber", "Subscriber", "Subscriber", "Subscribe…
## $ gender            <chr> "Male", "Male", "Male", "Female", "Female", "Male", …
## $ birthyear         <dbl> 1988, 1992, 1993, 1988, 1988, 1985, 1977, 1987, NA, …

Additional information about the data:

summary(trips_2014_Q1Q2)
##     trip_id         starttime           stoptime             bikeid    
##  Min.   :1109420   Length:905699      Length:905699      Min.   :   1  
##  1st Qu.:1464702   Class :character   Class :character   1st Qu.: 802  
##  Median :1768775   Mode  :character   Mode  :character   Median :1582  
##  Mean   :1760677                                         Mean   :1558  
##  3rd Qu.:2062163                                         3rd Qu.:2311  
##  Max.   :2355134                                         Max.   :3037  
##                                                                        
##   tripduration   from_station_id from_station_name  to_station_id  
##  Min.   :   60   Min.   :  5.0   Length:905699      Min.   :  5.0  
##  1st Qu.:  437   1st Qu.: 74.0   Class :character   1st Qu.: 74.0  
##  Median :  760   Median :150.0   Mode  :character   Median :150.0  
##  Mean   : 1073   Mean   :160.1                      Mean   :160.1  
##  3rd Qu.: 1256   3rd Qu.:245.0                      3rd Qu.:245.0  
##  Max.   :86245   Max.   :351.0                      Max.   :351.0  
##                                                                    
##  to_station_name      usertype            gender            birthyear     
##  Length:905699      Length:905699      Length:905699      Min.   :1898    
##  Class :character   Class :character   Class :character   1st Qu.:1972    
##  Mode  :character   Mode  :character   Mode  :character   Median :1981    
##                                                           Mean   :1978    
##                                                           3rd Qu.:1986    
##                                                           Max.   :1998    
##                                                           NA's   :313977

In order to run this analysis in Posit just the first half of the year 2014 will be included, to get the full analysis please open trip_analysis.R.

allData <- trips_2014_Q1Q2

The data collected has two kind of user types: customer and subscribers, the data of the subscribers don’t have year of birth and gender, to make sure all subscribers have all data we will use this query:

allData %>%
  filter( is.na(birthyear) |  is.na(gender)) %>%
  count(usertype)
## # A tibble: 2 Ă— 2
##   usertype        n
##   <chr>       <int>
## 1 Customer   313988
## 2 Subscriber     34

To avoid using incomplete data we will remove these 34 rows:

allData <- allData %>%
  filter(usertype == "Customer" | (usertype == "Subscriber" &  
                                     ! is.na(birthyear) & ! is.na(gender)))

Now, lets check if the trip duration calculation is correct:

allData %>%
  mutate(startime_time = mdy_hm(starttime),
         stoptime_time = mdy_hm(stoptime)) %>%
  mutate(newTripDuration = difftime(stoptime_time, startime_time, units = "secs"),
         newTripDurationNum = as.numeric(difftime(stoptime_time, startime_time, units = "secs"))) %>% 
  select(starttime, stoptime, tripduration, newTripDuration, newTripDurationNum)
## # A tibble: 905,665 Ă— 5
##    starttime       stoptime      tripduration newTripDuration newTripDurationNum
##    <chr>           <chr>                <dbl> <drtn>                       <dbl>
##  1 6/30/2014 23:57 7/1/2014 0:07          604  600 secs                      600
##  2 6/30/2014 23:56 7/1/2014 0:00          263  240 secs                      240
##  3 6/30/2014 23:33 6/30/2014 23…          126  120 secs                      120
##  4 6/30/2014 23:26 7/1/2014 0:24         3481 3480 secs                     3480
##  5 6/30/2014 23:16 6/30/2014 23…          638  600 secs                      600
##  6 6/30/2014 23:11 6/30/2014 23…          336  360 secs                      360
##  7 6/30/2014 23:08 6/30/2014 23…          304  300 secs                      300
##  8 6/30/2014 23:07 6/30/2014 23…          525  540 secs                      540
##  9 6/30/2014 23:07 6/30/2014 23…          203  180 secs                      180
## 10 6/30/2014 23:06 6/30/2014 23…          167  180 secs                      180
## # â„ą 905,655 more rows

The trip duration is not calculated in the right way, to correct this, we will add a new column with the right calculation:

allData <- allData %>%
  mutate(startime_time = mdy_hm(starttime),
         stoptime_time = mdy_hm(stoptime)) %>%
  mutate(newTripDuration = difftime(stoptime_time, startime_time, units = "secs"),
         newTripDurationNum = as.numeric(difftime(stoptime_time, startime_time, units = "secs")))

Now here is more information about the data after adding the right trip duration:

skim_without_charts(allData)
Data summary
Name allData
Number of rows 905665
Number of columns 16
_______________________
Column type frequency:
character 6
difftime 1
numeric 7
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
starttime 0 1.00 13 15 0 155576 0
stoptime 0 1.00 13 15 0 154903 0
from_station_name 0 1.00 12 36 0 300 0
to_station_name 0 1.00 12 36 0 300 0
usertype 0 1.00 8 10 0 2 0
gender 313988 0.65 4 6 0 2 0

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
newTripDuration 0 1 60 secs 86220 secs 780 secs 1071

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
trip_id 0 1.00 1760684.20 349441.69 1109420 1464716 1768786 2062164 2355134
bikeid 0 1.00 1557.53 867.44 1 802 1582 2311 3037
tripduration 0 1.00 1073.33 2152.84 60 437 760 1256 86245
from_station_id 0 1.00 160.11 99.04 5 74 150 245 351
to_station_id 0 1.00 160.12 99.15 5 74 150 245 351
birthyear 313977 0.65 1978.14 10.42 1898 1972 1981 1986 1998
newTripDurationNum 0 1.00 1073.36 2153.09 60 420 780 1260 86220

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
startime_time 0 1 2014-01-01 00:17:00 2014-06-30 23:57:00 2014-05-23 07:38:00 155576
stoptime_time 0 1 2014-01-01 00:42:00 2014-07-01 13:21:00 2014-05-23 07:50:00 154903

Information about the data

Now that the have consistent data we will get some information about it, lets show the mean and the max trip duration by user type:

allData %>% 
  group_by(usertype) %>% 
  summarise(mean_tripduration = mean(newTripDurationNum),
            max_tripduration = max(newTripDurationNum))
## # A tibble: 2 Ă— 3
##   usertype   mean_tripduration max_tripduration
##   <chr>                  <dbl>            <dbl>
## 1 Customer               1715.            86220
## 2 Subscriber              733.            85980

This is the the mean and the max trip duration by gender:

allData %>% 
  group_by(gender) %>% 
  summarise(mean_tripduration = mean(newTripDurationNum),
            max_tripduration = max(newTripDurationNum))
## # A tibble: 3 Ă— 3
##   gender mean_tripduration max_tripduration
##   <chr>              <dbl>            <dbl>
## 1 Female              863.            83940
## 2 Male                694.            85980
## 3 <NA>               1715.            86220

Here is the mean and max of the data combining user type and gender:

allData %>% 
  group_by(usertype, gender) %>% 
  summarise(mean_tripduration_hrs = mean(newTripDurationNum), 
            max_tripduration_hrs = max(newTripDurationNum) ,
            .groups = 'drop')
## # A tibble: 3 Ă— 4
##   usertype   gender mean_tripduration_hrs max_tripduration_hrs
##   <chr>      <chr>                  <dbl>                <dbl>
## 1 Customer   <NA>                   1715.                86220
## 2 Subscriber Female                  863.                83940
## 3 Subscriber Male                    694.                85980

Graphical analysis of the data

This is how the data is distributed by user type:

# Dataframe for usertype pie chart
df_usertype <- allData %>%
  group_by(usertype) %>%
  count() %>%
  ungroup() %>%
  mutate(perc = `n` / sum(`n`)) %>%
  arrange(perc) %>%
  mutate(labels = scales::percent(perc))

# Pie chart for usertype
ggplot(df_usertype, aes(x = "", y = perc, fill = usertype)) +
  coord_polar(theta = "y") + 
  geom_col(color = "black") +
  geom_text(aes(label = labels),
            position = position_stack(vjust = 0.5),
            show.legend = FALSE) +
  guides(fill = guide_legend(title = "")) +
  theme_void() + 
  ggtitle("User type") + 
  theme(plot.title = element_text(hjust = 0.5))

Subscribers distribution by gender is this:

# Dataframe for gender in subscribers pie chart
df_subsgender <- allData %>%
  filter(usertype == "Subscriber") %>%
  group_by(gender) %>%
  count() %>%
  ungroup() %>%
  mutate(perc = `n` / sum(`n`)) %>%
  arrange(perc) %>%
  mutate(labels = scales::percent(perc))

# Pie chart for usertype
ggplot(df_subsgender, aes(x = "", y = perc, fill = gender)) +
  coord_polar(theta = "y") + 
  geom_col(color = "black") +
  geom_text(aes(label = labels),
            position = position_stack(vjust = 0.5),
            show.legend = FALSE) +
  guides(fill = guide_legend(title = "")) +
  theme_void() + 
  ggtitle("Subscribers by gender") + 
  theme(plot.title = element_text(hjust = 0.5))

Another important piece of information about the data is the distribution of subscribers by their year of birth:

# Dataframe for birthyear distribution of subscribers
options(scipen=10000)
df_subsbirthyear <- allData %>%
  filter(usertype == "Subscriber") %>%
  group_by(period_birth=cut(birthyear,
                  breaks = c(1890,1940,1960,1980,2000), 
                  labels = c("1890-1940", "1941-1960", "1961-1980", "1981-2000")))

# Graph of subscribers by birthyear
ggplot(data = df_subsbirthyear, aes (x = period_birth)) +
  geom_bar(fill="lightblue") + 
  geom_text(stat = "count",aes(label = paste0(after_stat(count)
    ," (",scales::percent(after_stat(count)/ sum(after_stat(count))),")")
    , vjust = -0.5)) +
  theme_minimal() +
  ggtitle("Subscribers by birth year") + 
  theme(plot.title = element_text(hjust = 0.5),
        axis.title.y = element_blank()) +
  xlab("Birth year")

Finally the birth of year distribution by gender will be this:

# Graph of subscribers by birthyear and gender
ggplot(data = df_subsbirthyear, aes (x = period_birth, fill = gender)) +
  geom_bar() + 
  geom_text(stat = "count",aes(label = paste0(after_stat(count)
    ," (",scales::percent(after_stat(count)/ sum(after_stat(count))),")")
    , vjust = -0.5)) +
  theme_minimal() +
  ggtitle("Subscribers by sex and birth year") + 
  guides(fill = guide_legend(title = "")) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title.y = element_blank()) +
  xlab("Birth year")