library(tidyverse)
library(skimr)
library(ggplot2)
library(dplyr)
library(lubridate)
library(scales)
Analyze bike usage of Cyclic customers to Identify differences between annual members and casual riders.
# Data, read one 1 file at a time, if running this online in Posit load only first
trips_2014_Q1Q2 <- read_csv("Cyclistic_Data/Divvy_Trips_2014_Q1Q2.csv")
## Rows: 905699 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): starttime, stoptime, from_station_name, to_station_name, usertype, ...
## dbl (6): trip_id, bikeid, tripduration, from_station_id, to_station_id, birt...
##
## â„ą Use `spec()` to retrieve the full column specification for this data.
## â„ą Specify the column types or set `show_col_types = FALSE` to quiet this message.
# trips_2014_Q3_07 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q3-07.csv")
# trips_2014_Q3_0809 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q3-0809.csv")
# trips_2014_Q4 <- read_csv("Cyclistic_Data/Divvy_Trips_2014-Q4.csv")
This analysis will be performed in the information collected for the trips that took place in the first half of the year 2014. The information collected has these characteristics:
glimpse(trips_2014_Q1Q2)
## Rows: 905,699
## Columns: 12
## $ trip_id <dbl> 2355134, 2355133, 2355130, 2355129, 2355128, 2355127…
## $ starttime <chr> "6/30/2014 23:57", "6/30/2014 23:56", "6/30/2014 23:…
## $ stoptime <chr> "7/1/2014 0:07", "7/1/2014 0:00", "6/30/2014 23:35",…
## $ bikeid <dbl> 2006, 2217, 2798, 173, 173, 2782, 1152, 941, 2316, 3…
## $ tripduration <dbl> 604, 263, 126, 3481, 638, 336, 304, 525, 203, 167, 8…
## $ from_station_id <dbl> 131, 282, 327, 134, 320, 332, 174, 199, 288, 238, 17…
## $ from_station_name <chr> "Lincoln Ave & Belmont Ave", "Halsted St & Maxwell S…
## $ to_station_id <dbl> 303, 22, 225, 194, 134, 319, 44, 24, 94, 316, 55, 91…
## $ to_station_name <chr> "Broadway & Cornelia Ave", "May St & Taylor St", "Ha…
## $ usertype <chr> "Subscriber", "Subscriber", "Subscriber", "Subscribe…
## $ gender <chr> "Male", "Male", "Male", "Female", "Female", "Male", …
## $ birthyear <dbl> 1988, 1992, 1993, 1988, 1988, 1985, 1977, 1987, NA, …
Additional information about the data:
summary(trips_2014_Q1Q2)
## trip_id starttime stoptime bikeid
## Min. :1109420 Length:905699 Length:905699 Min. : 1
## 1st Qu.:1464702 Class :character Class :character 1st Qu.: 802
## Median :1768775 Mode :character Mode :character Median :1582
## Mean :1760677 Mean :1558
## 3rd Qu.:2062163 3rd Qu.:2311
## Max. :2355134 Max. :3037
##
## tripduration from_station_id from_station_name to_station_id
## Min. : 60 Min. : 5.0 Length:905699 Min. : 5.0
## 1st Qu.: 437 1st Qu.: 74.0 Class :character 1st Qu.: 74.0
## Median : 760 Median :150.0 Mode :character Median :150.0
## Mean : 1073 Mean :160.1 Mean :160.1
## 3rd Qu.: 1256 3rd Qu.:245.0 3rd Qu.:245.0
## Max. :86245 Max. :351.0 Max. :351.0
##
## to_station_name usertype gender birthyear
## Length:905699 Length:905699 Length:905699 Min. :1898
## Class :character Class :character Class :character 1st Qu.:1972
## Mode :character Mode :character Mode :character Median :1981
## Mean :1978
## 3rd Qu.:1986
## Max. :1998
## NA's :313977
In order to run this analysis in Posit just the first half of the year 2014 will be included, to get the full analysis please open trip_analysis.R.
allData <- trips_2014_Q1Q2
The data collected has two kind of user types: customer and subscribers, the data of the subscribers don’t have year of birth and gender, to make sure all subscribers have all data we will use this query:
allData %>%
filter( is.na(birthyear) | is.na(gender)) %>%
count(usertype)
## # A tibble: 2 Ă— 2
## usertype n
## <chr> <int>
## 1 Customer 313988
## 2 Subscriber 34
To avoid using incomplete data we will remove these 34 rows:
allData <- allData %>%
filter(usertype == "Customer" | (usertype == "Subscriber" &
! is.na(birthyear) & ! is.na(gender)))
Now, lets check if the trip duration calculation is correct:
allData %>%
mutate(startime_time = mdy_hm(starttime),
stoptime_time = mdy_hm(stoptime)) %>%
mutate(newTripDuration = difftime(stoptime_time, startime_time, units = "secs"),
newTripDurationNum = as.numeric(difftime(stoptime_time, startime_time, units = "secs"))) %>%
select(starttime, stoptime, tripduration, newTripDuration, newTripDurationNum)
## # A tibble: 905,665 Ă— 5
## starttime stoptime tripduration newTripDuration newTripDurationNum
## <chr> <chr> <dbl> <drtn> <dbl>
## 1 6/30/2014 23:57 7/1/2014 0:07 604 600 secs 600
## 2 6/30/2014 23:56 7/1/2014 0:00 263 240 secs 240
## 3 6/30/2014 23:33 6/30/2014 23… 126 120 secs 120
## 4 6/30/2014 23:26 7/1/2014 0:24 3481 3480 secs 3480
## 5 6/30/2014 23:16 6/30/2014 23… 638 600 secs 600
## 6 6/30/2014 23:11 6/30/2014 23… 336 360 secs 360
## 7 6/30/2014 23:08 6/30/2014 23… 304 300 secs 300
## 8 6/30/2014 23:07 6/30/2014 23… 525 540 secs 540
## 9 6/30/2014 23:07 6/30/2014 23… 203 180 secs 180
## 10 6/30/2014 23:06 6/30/2014 23… 167 180 secs 180
## # â„ą 905,655 more rows
The trip duration is not calculated in the right way, to correct this, we will add a new column with the right calculation:
allData <- allData %>%
mutate(startime_time = mdy_hm(starttime),
stoptime_time = mdy_hm(stoptime)) %>%
mutate(newTripDuration = difftime(stoptime_time, startime_time, units = "secs"),
newTripDurationNum = as.numeric(difftime(stoptime_time, startime_time, units = "secs")))
Now here is more information about the data after adding the right trip duration:
skim_without_charts(allData)
Name | allData |
Number of rows | 905665 |
Number of columns | 16 |
_______________________ | |
Column type frequency: | |
character | 6 |
difftime | 1 |
numeric | 7 |
POSIXct | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
starttime | 0 | 1.00 | 13 | 15 | 0 | 155576 | 0 |
stoptime | 0 | 1.00 | 13 | 15 | 0 | 154903 | 0 |
from_station_name | 0 | 1.00 | 12 | 36 | 0 | 300 | 0 |
to_station_name | 0 | 1.00 | 12 | 36 | 0 | 300 | 0 |
usertype | 0 | 1.00 | 8 | 10 | 0 | 2 | 0 |
gender | 313988 | 0.65 | 4 | 6 | 0 | 2 | 0 |
Variable type: difftime
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
newTripDuration | 0 | 1 | 60 secs | 86220 secs | 780 secs | 1071 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
trip_id | 0 | 1.00 | 1760684.20 | 349441.69 | 1109420 | 1464716 | 1768786 | 2062164 | 2355134 |
bikeid | 0 | 1.00 | 1557.53 | 867.44 | 1 | 802 | 1582 | 2311 | 3037 |
tripduration | 0 | 1.00 | 1073.33 | 2152.84 | 60 | 437 | 760 | 1256 | 86245 |
from_station_id | 0 | 1.00 | 160.11 | 99.04 | 5 | 74 | 150 | 245 | 351 |
to_station_id | 0 | 1.00 | 160.12 | 99.15 | 5 | 74 | 150 | 245 | 351 |
birthyear | 313977 | 0.65 | 1978.14 | 10.42 | 1898 | 1972 | 1981 | 1986 | 1998 |
newTripDurationNum | 0 | 1.00 | 1073.36 | 2153.09 | 60 | 420 | 780 | 1260 | 86220 |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
startime_time | 0 | 1 | 2014-01-01 00:17:00 | 2014-06-30 23:57:00 | 2014-05-23 07:38:00 | 155576 |
stoptime_time | 0 | 1 | 2014-01-01 00:42:00 | 2014-07-01 13:21:00 | 2014-05-23 07:50:00 | 154903 |
Now that the have consistent data we will get some information about it, lets show the mean and the max trip duration by user type:
allData %>%
group_by(usertype) %>%
summarise(mean_tripduration = mean(newTripDurationNum),
max_tripduration = max(newTripDurationNum))
## # A tibble: 2 Ă— 3
## usertype mean_tripduration max_tripduration
## <chr> <dbl> <dbl>
## 1 Customer 1715. 86220
## 2 Subscriber 733. 85980
This is the the mean and the max trip duration by gender:
allData %>%
group_by(gender) %>%
summarise(mean_tripduration = mean(newTripDurationNum),
max_tripduration = max(newTripDurationNum))
## # A tibble: 3 Ă— 3
## gender mean_tripduration max_tripduration
## <chr> <dbl> <dbl>
## 1 Female 863. 83940
## 2 Male 694. 85980
## 3 <NA> 1715. 86220
Here is the mean and max of the data combining user type and gender:
allData %>%
group_by(usertype, gender) %>%
summarise(mean_tripduration_hrs = mean(newTripDurationNum),
max_tripduration_hrs = max(newTripDurationNum) ,
.groups = 'drop')
## # A tibble: 3 Ă— 4
## usertype gender mean_tripduration_hrs max_tripduration_hrs
## <chr> <chr> <dbl> <dbl>
## 1 Customer <NA> 1715. 86220
## 2 Subscriber Female 863. 83940
## 3 Subscriber Male 694. 85980
This is how the data is distributed by user type:
# Dataframe for usertype pie chart
df_usertype <- allData %>%
group_by(usertype) %>%
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
# Pie chart for usertype
ggplot(df_usertype, aes(x = "", y = perc, fill = usertype)) +
coord_polar(theta = "y") +
geom_col(color = "black") +
geom_text(aes(label = labels),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
guides(fill = guide_legend(title = "")) +
theme_void() +
ggtitle("User type") +
theme(plot.title = element_text(hjust = 0.5))
Subscribers distribution by gender is this:
# Dataframe for gender in subscribers pie chart
df_subsgender <- allData %>%
filter(usertype == "Subscriber") %>%
group_by(gender) %>%
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc))
# Pie chart for usertype
ggplot(df_subsgender, aes(x = "", y = perc, fill = gender)) +
coord_polar(theta = "y") +
geom_col(color = "black") +
geom_text(aes(label = labels),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
guides(fill = guide_legend(title = "")) +
theme_void() +
ggtitle("Subscribers by gender") +
theme(plot.title = element_text(hjust = 0.5))
Another important piece of information about the data is the distribution of subscribers by their year of birth:
# Dataframe for birthyear distribution of subscribers
options(scipen=10000)
df_subsbirthyear <- allData %>%
filter(usertype == "Subscriber") %>%
group_by(period_birth=cut(birthyear,
breaks = c(1890,1940,1960,1980,2000),
labels = c("1890-1940", "1941-1960", "1961-1980", "1981-2000")))
# Graph of subscribers by birthyear
ggplot(data = df_subsbirthyear, aes (x = period_birth)) +
geom_bar(fill="lightblue") +
geom_text(stat = "count",aes(label = paste0(after_stat(count)
," (",scales::percent(after_stat(count)/ sum(after_stat(count))),")")
, vjust = -0.5)) +
theme_minimal() +
ggtitle("Subscribers by birth year") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.y = element_blank()) +
xlab("Birth year")
Finally the birth of year distribution by gender will be this:
# Graph of subscribers by birthyear and gender
ggplot(data = df_subsbirthyear, aes (x = period_birth, fill = gender)) +
geom_bar() +
geom_text(stat = "count",aes(label = paste0(after_stat(count)
," (",scales::percent(after_stat(count)/ sum(after_stat(count))),")")
, vjust = -0.5)) +
theme_minimal() +
ggtitle("Subscribers by sex and birth year") +
guides(fill = guide_legend(title = "")) +
theme(plot.title = element_text(hjust = 0.5),
axis.title.y = element_blank()) +
xlab("Birth year")