Cyclistic Bike Share Report

library(tidyverse)
library(skimr)
library(ggplot2)
library(dplyr)
library(lubridate)
library(scales)

Cyclistic case study

Business task

Analyze bike usage of Cyclic customers to Identify differences between annual members and casual riders.

Data sources

# Data, read one 1 file at a time, if running this online in Posit load only first 
trips_2014_Q1Q2 <- read_csv("Cyclistic_Data/Divvy_Trips_2014_Q1Q2.csv")

## Rows: 905699 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): starttime, stoptime, from_station_name, to_station_name, usertype, ...
## dbl (6): trip_id, bikeid, tripduration, from_station_id, to_station_id, birt...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# trips_2014_Q3_07 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q3-07.csv")
# trips_2014_Q3_0809 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q3-0809.csv")
# trips_2014_Q4 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q4.csv")

This analysis will be performed in the information collected for the trips that took place in the first half of the year 2014. The information collected has these characteristics:

glimpse(trips_2014_Q1Q2)

## Rows: 905,699
## Columns: 12
## $ trip_id           <dbl> 2355134, 2355133, 2355130, 2355129, 2355128, 2355127…
## $ starttime         <chr> "6/30/2014 23:57", "6/30/2014 23:56", "6/30/2014 23:…
## $ stoptime          <chr> "7/1/2014 0:07", "7/1/2014 0:00", "6/30/2014 23:35",…
## $ bikeid            <dbl> 2006, 2217, 2798, 173, 173, 2782, 1152, 941, 2316, 3…
## $ tripduration      <dbl> 604, 263, 126, 3481, 638, 336, 304, 525, 203, 167, 8…
## $ from_station_id   <dbl> 131, 282, 327, 134, 320, 332, 174, 199, 288, 238, 17…
## $ from_station_name <chr> "Lincoln Ave & Belmont Ave", "Halsted St & Maxwell S…
## $ to_station_id     <dbl> 303, 22, 225, 194, 134, 319, 44, 24, 94, 316, 55, 91…
## $ to_station_name   <chr> "Broadway & Cornelia Ave", "May St & Taylor St", "Ha…
## $ usertype          <chr> "Subscriber", "Subscriber", "Subscriber", "Subscribe…
## $ gender            <chr> "Male", "Male", "Male", "Female", "Female", "Male", …
## $ birthyear         <dbl> 1988, 1992, 1993, 1988, 1988, 1985, 1977, 1987, NA, …

Additional information about the data:

summary(trips_2014_Q1Q2)

##     trip_id         starttime           stoptime             bikeid    
##  Min.   :1109420   Length:905699      Length:905699      Min.   :   1  
##  1st Qu.:1464702   Class :character   Class :character   1st Qu.: 802  
##  Median :1768775   Mode  :character   Mode  :character   Median :1582  
##  Mean   :1760677                                         Mean   :1558  
##  3rd Qu.:2062163                                         3rd Qu.:2311  
##  Max.   :2355134                                         Max.   :3037  
##                                                                        
##   tripduration   from_station_id from_station_name  to_station_id  
##  Min.   :   60   Min.   :  5.0   Length:905699      Min.   :  5.0  
##  1st Qu.:  437   1st Qu.: 74.0   Class :character   1st Qu.: 74.0  
##  Median :  760   Median :150.0   Mode  :character   Median :150.0  
##  Mean   : 1073   Mean   :160.1                      Mean   :160.1  
##  3rd Qu.: 1256   3rd Qu.:245.0                      3rd Qu.:245.0  
##  Max.   :86245   Max.   :351.0                      Max.   :351.0  
##                                                                    
##  to_station_name      usertype            gender            birthyear     
##  Length:905699      Length:905699      Length:905699      Min.   :1898    
##  Class :character   Class :character   Class :character   1st Qu.:1972    
##  Mode  :character   Mode  :character   Mode  :character   Median :1981    
##                                                           Mean   :1978    
##                                                           3rd Qu.:1986    
##                                                           Max.   :1998    
##                                                           NA's   :313977

In order to run this analysis in Posit just the first half of the year 2014 will be included, to get the full analysis please open trip_analysis.R.

allData <- trips_2014_Q1Q2

The data collected has two kind of user types: customer and subscribers, the data of the subscribers don’t have year of birth and gender, to make sure all subscribers have all data we will use this query:

allData %>%
  filter( is.na(birthyear) |  is.na(gender)) %>%
  count(usertype)

## # A tibble: 2 × 2
##   usertype        n
##   <chr>       <int>
## 1 Customer   313988
## 2 Subscriber     34

To avoid using incomplete data we will remove these 34 rows:

allData <- allData %>%
  filter(usertype == "Customer" | (usertype == "Subscriber" &  
                                     ! is.na(birthyear) & ! is.na(gender)))

Now, lets check if the trip duration calculation is correct:

allData %>%
  mutate(startime_time = mdy_hm(starttime),
         stoptime_time = mdy_hm(stoptime)) %>%
  mutate(newTripDuration = difftime(stoptime_time, startime_time, units = "secs"),
         newTripDurationNum = as.numeric(difftime(stoptime_time, startime_time, units = "secs"))) %>% 
  select(starttime, stoptime, tripduration, newTripDuration, newTripDurationNum)

## # A tibble: 905,665 × 5
##    starttime       stoptime      tripduration newTripDuration newTripDurationNum
##    <chr>           <chr>                <dbl> <drtn>                       <dbl>
##  1 6/30/2014 23:57 7/1/2014 0:07          604  600 secs                      600
##  2 6/30/2014 23:56 7/1/2014 0:00          263  240 secs                      240
##  3 6/30/2014 23:33 6/30/2014 23…          126  120 secs                      120
##  4 6/30/2014 23:26 7/1/2014 0:24         3481 3480 secs                     3480
##  5 6/30/2014 23:16 6/30/2014 23…          638  600 secs                      600
##  6 6/30/2014 23:11 6/30/2014 23…          336  360 secs                      360
##  7 6/30/2014 23:08 6/30/2014 23…          304  300 secs                      300
##  8 6/30/2014 23:07 6/30/2014 23…          525  540 secs                      540
##  9 6/30/2014 23:07 6/30/2014 23…          203  180 secs                      180
## 10 6/30/2014 23:06 6/30/2014 23…          167  180 secs                      180
## # ℹ 905,655 more rows

The trip duration is not calculated in the right way, to correct this, we will add a new column with the right calculation:

allData <- allData %>%
  mutate(startime_time = mdy_hm(starttime),
         stoptime_time = mdy_hm(stoptime)) %>%
  mutate(newTripDuration = difftime(stoptime_time, startime_time, units = "secs"),
         newTripDurationNum = as.numeric(difftime(stoptime_time, startime_time, units = "secs")))

Now here is more information about the data after adding the right trip duration:

skim_without_charts(allData)

Data summary
Name	allData
Number of rows	905665
Number of columns	16
_______________________
Column type frequency:
character	6
difftime	1
numeric	7
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
starttime	0	1.00	13	15	155576
stoptime	0	1.00	13	15	154903
from_station_name	0	1.00	12	36	300
to_station_name	0	1.00	12	36	300
usertype	0	1.00	8	10	2
gender	313988	0.65	4	6	2

Variable type: difftime

skim_variable	n_missing	complete_rate	min	max	median	n_unique
newTripDuration	0	1	60 secs	86220 secs	780 secs	1071

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
trip_id	0	1.00	1760684.20	349441.69	1109420	1464716	1768786	2062164	2355134
bikeid	0	1.00	1557.53	867.44	1	802	1582	2311	3037
tripduration	0	1.00	1073.33	2152.84	60	437	760	1256	86245
from_station_id	0	1.00	160.11	99.04	5	74	150	245	351
to_station_id	0	1.00	160.12	99.15	5	74	150	245	351
birthyear	313977	0.65	1978.14	10.42	1898	1972	1981	1986	1998
newTripDurationNum	0	1.00	1073.36	2153.09	60	420	780	1260	86220

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
startime_time	0	1	2014-01-01 00:17:00	2014-06-30 23:57:00	2014-05-23 07:38:00	155576
stoptime_time	0	1	2014-01-01 00:42:00	2014-07-01 13:21:00	2014-05-23 07:50:00	154903

Information about the data

Now that the have consistent data we will get some information about it, lets show the mean and the max trip duration by user type:

allData %>% 
  group_by(usertype) %>% 
  summarise(mean_tripduration = mean(newTripDurationNum),
            max_tripduration = max(newTripDurationNum))

## # A tibble: 2 × 3
##   usertype   mean_tripduration max_tripduration
##   <chr>                  <dbl>            <dbl>
## 1 Customer               1715.            86220
## 2 Subscriber              733.            85980

This is the the mean and the max trip duration by gender:

allData %>% 
  group_by(gender) %>% 
  summarise(mean_tripduration = mean(newTripDurationNum),
            max_tripduration = max(newTripDurationNum))

## # A tibble: 3 × 3
##   gender mean_tripduration max_tripduration
##   <chr>              <dbl>            <dbl>
## 1 Female              863.            83940
## 2 Male                694.            85980
## 3 <NA>               1715.            86220

Here is the mean and max of the data combining user type and gender:

allData %>% 
  group_by(usertype, gender) %>% 
  summarise(mean_tripduration_hrs = mean(newTripDurationNum), 
            max_tripduration_hrs = max(newTripDurationNum) ,
            .groups = 'drop')

## # A tibble: 3 × 4
##   usertype   gender mean_tripduration_hrs max_tripduration_hrs
##   <chr>      <chr>                  <dbl>                <dbl>
## 1 Customer   <NA>                   1715.                86220
## 2 Subscriber Female                  863.                83940
## 3 Subscriber Male                    694.                85980

Graphical analysis of the data

This is how the data is distributed by user type:

# Dataframe for usertype pie chart
df_usertype <- allData %>%
  group_by(usertype) %>%
  count() %>%
  ungroup() %>%
  mutate(perc = `n` / sum(`n`)) %>%
  arrange(perc) %>%
  mutate(labels = scales::percent(perc))

# Pie chart for usertype
ggplot(df_usertype, aes(x = "", y = perc, fill = usertype)) +
  coord_polar(theta = "y") + 
  geom_col(color = "black") +
  geom_text(aes(label = labels),
            position = position_stack(vjust = 0.5),
            show.legend = FALSE) +
  guides(fill = guide_legend(title = "")) +
  theme_void() + 
  ggtitle("User type") + 
  theme(plot.title = element_text(hjust = 0.5))

Subscribers distribution by gender is this:

# Dataframe for gender in subscribers pie chart
df_subsgender <- allData %>%
  filter(usertype == "Subscriber") %>%
  group_by(gender) %>%
  count() %>%
  ungroup() %>%
  mutate(perc = `n` / sum(`n`)) %>%
  arrange(perc) %>%
  mutate(labels = scales::percent(perc))

# Pie chart for usertype
ggplot(df_subsgender, aes(x = "", y = perc, fill = gender)) +
  coord_polar(theta = "y") + 
  geom_col(color = "black") +
  geom_text(aes(label = labels),
            position = position_stack(vjust = 0.5),
            show.legend = FALSE) +
  guides(fill = guide_legend(title = "")) +
  theme_void() + 
  ggtitle("Subscribers by gender") + 
  theme(plot.title = element_text(hjust = 0.5))

Another important piece of information about the data is the distribution of subscribers by their year of birth:

# Dataframe for birthyear distribution of subscribers
options(scipen=10000)
df_subsbirthyear <- allData %>%
  filter(usertype == "Subscriber") %>%
  group_by(period_birth=cut(birthyear,
                  breaks = c(1890,1940,1960,1980,2000), 
                  labels = c("1890-1940", "1941-1960", "1961-1980", "1981-2000")))

# Graph of subscribers by birthyear
ggplot(data = df_subsbirthyear, aes (x = period_birth)) +
  geom_bar(fill="lightblue") + 
  geom_text(stat = "count",aes(label = paste0(after_stat(count)
    ," (",scales::percent(after_stat(count)/ sum(after_stat(count))),")")
    , vjust = -0.5)) +
  theme_minimal() +
  ggtitle("Subscribers by birth year") + 
  theme(plot.title = element_text(hjust = 0.5),
        axis.title.y = element_blank()) +
  xlab("Birth year")

Finally the birth of year distribution by gender will be this:

# Graph of subscribers by birthyear and gender
ggplot(data = df_subsbirthyear, aes (x = period_birth, fill = gender)) +
  geom_bar() + 
  geom_text(stat = "count",aes(label = paste0(after_stat(count)
    ," (",scales::percent(after_stat(count)/ sum(after_stat(count))),")")
    , vjust = -0.5)) +
  theme_minimal() +
  ggtitle("Subscribers by sex and birth year") + 
  guides(fill = guide_legend(title = "")) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.title.y = element_blank()) +
  xlab("Birth year")