```{r setup, echo=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(
echo = F,
collapse = TRUE,
message = FALSE,
warning = FALSE,
out.width = "70%",
fig.align = 'center',
fig.width = 7,
fig.asp = 0.618, # 1 / phi
fig.show = "hold"
)
library(assertthat)
library(tidyverse)
library(rvest)
library(lubridate)
library(stringr)
library(rmarkdown)
library(dygraphs)
library(xts)
# Stored by default in C:/Users/}/.R/gargle/gargle-oauth
options(
gargle_oauth_cache = "/config",
googleAuthR.scopes.selected = "https://www.googleapis.com/auth/analytics.readonly" # Readonly scope is required for certain management API calls. See https://developers.google.com/analytics/devguides/config/mgmt/v3/authorization#OAuth2Authorizing
)
library(googleAuthR)
# Service Account Credentials
# gar_auth_service(json_file=Sys.getenv("SERVICE_JSON"))
library(googleAnalyticsR)
# Alternatively, an OAuth client and email can be provided
gar_set_client(json = Sys.getenv("CLIENT_JSON"))
ga_auth(email=Sys.getenv("CLIENT_EMAIL"))
theme_set(theme_minimal() + theme(text=element_text(size=16)))
# Select an account ID and view ID to begin
account_id <- Sys.getenv("ACCOUNT_ID")
view_id <- Sys.getenv("VIEW_ID")
```
```{r property and view data, message=FALSE, warning=FALSE}
accounts <- ga_accounts()
cur_account <- accounts %>% filter(id == account_id)
property_df <- ga_webproperty_list(account_id)
views_df <- bind_rows(lapply(property_df$id, function(x){ga_view_list(accountId = account_id, webPropertyId = x)})) %>% left_join(property_df,by=c("webPropertyId"="id"))
# View-specific
view_name <- (views_df %>% filter(id == view_id))$name.x
view_url <- (views_df %>% filter(id == view_id))$websiteUrl.x
property_url <- (views_df %>% filter(id == view_id))$websiteUrl.y
property_name <- (views_df %>% filter(id == view_id))$name.y
property_id <- (views_df %>% filter(id == view_id))$webPropertyId
custom_dimensions_df <- ga_custom_vars_list(accountId = account_id, webPropertyId = property_id)
goals_df <- ga_goal_list(webPropertyId = property_id, profileId = view_id, accountId=account_id)
view_filters_df <- ga_filter_view_list(webPropertyId = property_id, viewId = view_id,accountId = account_id)
if(nrow(view_filters_df) > 0){
view_filters_df <- view_filters_df %>% select(filterRef.id, profileRef.id, profileRef.name, rank) %>% left_join(ga_filter_list(account_id), by=c("filterRef.id" = "id"))
}
```
# Account Structure
## Properties
The account, __`r cur_account$name`__, has `r nrow(property_df)` properties. These are shown below sorted by update date in reverse chronological order.
```{r rows.print=5,cols.min.print=3}
property_df %>% arrange(desc(updated)) %>% select(Property=name, URL=websiteUrl, `Updated Date`=updated)
```
## Views
```{r}
property_with_most_views <- views_df %>% group_by(`name.y`) %>% tally() %>% arrange(desc(n)) %>% ungroup() %>% filter(row_number() == 1)
```
In total, there are `r nrow(views_df)` views. These are shown below in reverse chronological order by update date.
```{r rows.print=5,cols.min.print=3}
views_df %>% arrange(desc(updated.x)) %>% select(View=name.x, Property=name.y, `Update Date`=updated.x)
```
TODO - only need to show this if there is more than 1 property
The property with the most views is __`r property_with_most_views$name.y`__ with `r property_with_most_views$n` views. The number of views per property is below.
```{r fig.height = 4}
views_df %>% group_by(webPropertyId, name.y) %>% tally() %>% select(`Property ID`=webPropertyId,Name=name.y,`View Count`=n) %>% arrange(desc(`View Count`))
```
# Configuration
## Custom Dimensions
```{r}
no_custom_dimensions <- nrow(custom_dimensions_df) == 0
```
The view, __`r view_name`__, has no associated custom dimensions
Property __`r property_name`__ has `r nrow(custom_dimensions_df)` custom dimensions shown below:
```{r}
if(!no_custom_dimensions)
{
custom_dimensions_df %>% select(ID = index, Name = name, Scope = scope, `Is Active?`=active)
}
```
```{r}
# Check to see if there are any goals and change display of markdown accordingly
no_goals <- nrow(goals_df) == 0
no_destination_goals <- F
no_event_goals <- F
if(!no_goals)
{
no_destination_goals <- nrow(goals_df %>% filter(type == "URL_DESTINATION")) == 0
no_event_goals <- nrow(goals_df %>% filter(type == "EVENT")) == 0
}
```
## Goals {.tabset .tabset-fade .tabset-pills}
### All Goals (`r nrow(goals_df)`)
View __`r view_name`__ has `r nrow(goals_df)` goals listed below.
```{r rows.print=5}
if(!no_goals)
{
goals_df %>% select(Goal = name, Type = type, `Is Active?`=active)
}
```
### Destination Goals (`r if(nrow(goals_df) != 0){nrow(goals_df %>% filter(type == "URL_DESTINATION"))}else{ integer(1)}`)
```{r}
if(!no_goals && !no_destination_goals)
{
goals_df %>% filter(type == "URL_DESTINATION") %>% select(Goal = name,URL = urlDestinationDetails.url)
}
```
### Event Goals (`r if(nrow(goals_df) != 0){nrow(goals_df %>% filter(type == "EVENT"))}else{integer(1)}`)
```{r}
if(!no_goals && !no_event_goals)
{
goals_df %>% filter(type == "EVENT") %>% select(Goal = name,`Event Details` = eventDetails.eventConditions)
}
```
```{r}
# Check to see if there are any goals and change display of markdown accordingly
no_filters <- nrow(view_filters_df) == 0
no_include_exclude_filters <- T
if(!no_filters)
{
no_include_exclude_filters <- nrow(view_filters_df %>% filter(type == "INCLUDE" | type == "EXCLUDE")) == 0
}
```
## Filters {.tabset .tabset-fade .tabset-pills}
### All Filters (`r nrow(view_filters_df)`)
View __`r view_name`__ has `r nrow(view_filters_df)` filters listed below.
```{r rows.print=7}
if(!no_filters)
{
view_filters_df %>% select(Filter = name, Type = type)
}
```
### Include Filters (`r if(!no_filters){nrow(view_filters_df %>% filter(type == "INCLUDE"))}else{integer(1)}`)
Include/Exclude filters are called out below along with their criteria.
```{r rows.print=7}
if(!no_filters && nrow(view_filters_df %>% filter(type == "INCLUDE")) > 0)
{
view_filters_df %>% filter(type=="INCLUDE") %>% mutate(Criteria = if_else(type == "INCLUDE",paste(includeDetails.field,includeDetails.matchType,includeDetails.expressionValue),paste(excludeDetails.field,excludeDetails.matchType,excludeDetails.expressionValue))) %>% select(Filter = name, Type = type, Criteria)
}
```
### Exclude Filters (`r if(!no_filters){nrow(view_filters_df %>% filter(type == "EXCLUDE"))}else{integer(1)}`)
Include/Exclude filters are called out below along with their criteria.
```{r rows.print=7}
if(!no_filters && nrow(view_filters_df %>% filter(type == "EXCLUDE")) > 0)
{
view_filters_df %>% filter(type=="EXCLUDE") %>% mutate(Criteria = if_else(type == "INCLUDE",paste(includeDetails.field,includeDetails.matchType,includeDetails.expressionValue),paste(excludeDetails.field,excludeDetails.matchType,excludeDetails.expressionValue))) %>% select(Filter = name, Type = type, Criteria)
}
```
# Data Integrity
## Goal Measurement {.tabset .tabset-fade .tabset-pills}
```{r message=FALSE,warning=FALSE}
if(!no_goals)
{
# Capture goal IDs to review
ga_goal_metric_names <- paste0("goal",goals_df$id,"Completions")
ga_goal_names <- goals_df$name
goal_lookup <- data.frame(list(Goal=ga_goal_names,Metric_Name=ga_goal_metric_names))
date_range_7 <- c(Sys.Date() - 8,Sys.Date() - 1)
date_range_30 <- c(Sys.Date() - 31,Sys.Date() - 1)
date_range_90 <- c(Sys.Date() - 91,Sys.Date() - 1)
get_goal_data <- function(date_range){
google_analytics(viewId = view_id, date_range = date_range, metrics = ga_goal_metric_names, dimensions = c("channelGrouping")) %>% select(-channelGrouping) %>% pivot_longer(cols=everything(),names_to="Goal",values_to = "Completions") %>% group_by(Goal) %>% summarise(Completions = sum(Completions)) %>% left_join(goal_lookup,by=c("Goal"="Metric_Name")) %>% select(Goal = Goal.y,Completions)
}
ga_goal_data_7 <- get_goal_data(date_range_7)
ga_goal_data_30 <- get_goal_data(date_range_30)
ga_goal_data_90 <- get_goal_data(date_range_90)
}
```
### Past 7 Days { `r if_else(no_goals,".hide","")`}
```{r}
if(!no_goals)
{
ga_goal_data_7
}
```
### Past 30 Days { `r if_else(no_goals,".hide","")`}
```{r}
if(!no_goals)
{
ga_goal_data_30
}
```
### Past 90 Days { `r if_else(no_goals,".hide","")`}
```{r}
if(!no_goals)
{
ga_goal_data_90
}
```
## Hostnames
```{r message=FALSE,warning=FALSE}
ga_hostnames <- google_analytics(viewId = view_id, date_range = c(Sys.Date()-31,Sys.Date()-1),metrics = "sessions",dimensions = c("hostname"),max=-1)
```
In many cases, GA properties and views are capturing data from the wrong domain or sub-domain. As a reference, the URL associated with __`r view_name`__ is `r view_url` and the URL associated with its property is `r property_url`. These URLs are configured in the GA admin and don't necessarily indicate which domains should be tracked.
Data has been retrieved for the last 30 days (`r paste0(Sys.Date()-31," to ",Sys.Date()-1)`). Overall, `r nrow(ga_hostnames)` unique hostnames were discovered.
```{r}
ga_hostnames
```
```{r message=FALSE,warning=FALSE}
ga_events <- google_analytics(viewId = view_id, date_range = c(Sys.Date()-31,Sys.Date()-1),metrics = "totalEvents",dimensions = c("eventCategory","eventAction","eventLabel"),max=-1)
```
## Events {.tabset .tabset-fade .tabset-pills `r if_else(nrow(ga_events) == 0,".hide","")`}
### Top Events
```{r}
ga_events %>% arrange(desc(totalEvents)) %>% select(Category = eventCategory, Action = eventAction, Label = eventLabel, Events = totalEvents) %>% head(100)
```
### Unique Event Categories
```{r}
ga_events %>% select(`Event Category` = eventCategory, Total = totalEvents) %>% group_by(`Event Category`) %>% summarise(Total = sum(Total)) %>% arrange(desc(Total))
```
### Unique Event Actions
```{r}
ga_events %>% select(`Event Action` = eventAction, Total = totalEvents) %>% group_by(`Event Action`) %>% summarise(Total = sum(Total)) %>% arrange(desc(Total))
```
### Unique Event Labels
```{r}
ga_events %>% select(`Event Label` = eventLabel, Total = totalEvents) %>% group_by(`Event Label`) %>% summarise(Total = sum(Total)) %>% arrange(desc(Total))
```
## Source / Medium / Campaign {.tabset .tabset-fade .tabset-pills}
### Sources
```{r}
ga_sources <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("source")) %>% arrange(desc(sessions))
ga_sources
```
### Mediums
```{r}
ga_mediums <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("medium")) %>% arrange(desc(sessions))
ga_mediums
```
### Campaigns
```{r}
ga_campaigns <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("campaign")) %>% arrange(desc(sessions))
ga_campaigns
```
# Trends
## All Traffic
```{r fig.width = 8, fig.asp = .3}
ga_all_traffic_current <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("date")) %>% rename(`Sessions - Last 30 Days`=sessions) %>% mutate(row = row_number())
ga_all_traffic_mom <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-62,Sys.Date()-32),metrics="sessions",dimensions = c("date")) %>% rename(`Sessions - Previous Period`=sessions) %>% mutate(row = row_number())
ga_all_traffic_yoy <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-365-30,Sys.Date()-365-1),metrics="sessions",dimensions = c("date")) %>% rename(`Sessions - Previous Year`=sessions) %>% mutate(row = row_number())
ga_all_traffic <- ga_all_traffic_current %>% left_join(ga_all_traffic_mom %>% select(-date)) %>% left_join(ga_all_traffic_yoy %>% select(-date)) %>% select(-row) %>% pivot_longer(cols=-date,names_to = "Period",values_to="Sessions")
ga_all_traffic_wide <- ga_all_traffic %>% pivot_wider(id_cols = c(date), names_from = Period, values_from = Sessions)
ga_sessions_last_30_xts <- xts(ga_all_traffic_wide %>% select(2), order.by = ga_all_traffic_wide$date)
ga_sessions_prev_period_xts <- xts(ga_all_traffic_wide %>% select(3), order.by = ga_all_traffic_wide$date)
ga_sessions_prev_year_xts <- xts(ga_all_traffic_wide %>% select(4), order.by = ga_all_traffic_wide$date)
ga_sessions_xts <- cbind(ga_sessions_last_30_xts,ga_sessions_prev_period_xts,ga_sessions_prev_year_xts)
dygraph(ga_sessions_xts) %>%
dyRangeSelector() %>% # Optionally, we can include an interactive date range selector
dyOptions(fillGraph = TRUE, fillAlpha = 0.15, gridLineColor = "#DDDDDD", axisLabelFontSize = 10) %>%
dySeries("Sessions...Last.30.Days", label = "Sessions", drawPoints = T, pointSize = 2, strokeWidth=2) %>%
dySeries("Sessions...Previous.Period", label = "Previous Period", drawPoints = T, pointSize = 2, strokeWidth=2) %>%
dySeries("Sessions...Previous.Year", label = "Previous Year", drawPoints = T, pointSize = 2, strokeWidth=2) %>%
dyAxis(name = "y", axisLineColor = "white", label = "Sessions") %>%
dyAxis(name = "x", drawGrid = F) %>%
dyLegend(show = "always", hideOnMouseOut =F, width = 600)
```
## Landing Pages
Showing top 100 landing pages by number of sessions.
```{r message=FALSE,warning=FALSE}
ga_landing_pages <- google_analytics(viewId=view_id, date_range=c(Sys.Date()-31,Sys.Date()-1),metrics=c("sessions","bounceRate","avgSessionDuration"), dimensions=c("landingPagePath"),max=100) %>% arrange(desc(sessions)) %>% rename(`Landing Page` = landingPagePath,`Bounce Rate` = bounceRate, `Avg Session Duration` = avgSessionDuration, Sessions = sessions)
ga_landing_pages
```
## Top Referral Sources
```{r message=FALSE,warning=FALSE}
filter_clause <- filter_clause_ga4(list(dim_filter(dimension = "channelGrouping", operator = "EXACT", expressions = "Referral")))
ga_referrers <- google_analytics(viewId=view_id, date_range=c(Sys.Date()-31,Sys.Date()-1),metrics=c("sessions","bounceRate","avgSessionDuration"), dimensions=c("sourceMedium"),dim_filters = filter_clause,max=100) %>% rename(`source / medium` = sourceMedium)
if(!is.null(ga_referrers))
{
ga_referrers %>% arrange(desc(sessions))
}
```
# Common Issues
## Hard-Coded Analytics Scripts
Looking for hard-coded analytics scripts in the HTML of the website pages associated with each property.
```{r message=FALSE}
lapply(property_df$websiteUrl,function(x){
read_html(x) %>% html_nodes("script") %>% xml2::xml_find_all(".//text()") %>% as.character() %>% enframe() %>% mutate(websiteUrl = x) %>% mutate(`gtm.js`=str_detect(value, "gtm.js")) %>% mutate(`analytics.js`=str_detect(value, "/analytics.js")) %>% mutate(`gtag.js`=str_detect(value, "gtag.js"))
}) %>% bind_rows() %>%
group_by(websiteUrl) %>% summarise(`gtm.js`=any(`gtm.js`),`analytics.js`=any(`analytics.js`),`gtag.js`=any(`gtag.js`)) %>% left_join(property_df %>% select(websiteUrl,id)) %>% select(`Property ID`=id, URL=websiteUrl,`gtm.js`,`analytics.js`,`gtag.js`)
```
## Email Address in Page Paths
```{r message=FALSE}
pii_clause <- filter_clause_ga4(list(dim_filter(dimension = "pagePath", operator = "REGEXP", expressions = "\\?.+email=")))
ga <- google_analytics(viewId=view_id, date_range=c(Sys.Date()-91,Sys.Date()-1),metrics=c("pageViews"), dimensions=c("pagePath"),dim_filters = pii_clause,max=-1)
if(is.null(ga))ga <- data.frame()
```
Found `r nrow(ga)` page paths with 'email' in the query string in the last 90 days.
```{r}
if(nrow(ga) > 0)
{
ga %>% select(pagePath)
}
```
# TODO List
- Campaign Trends
- Anomalies - Gaps in data, start/end date of data collection
- How to handle key pages or flows to track, like login?
- How to handle form completion?
- E-commerce considerations
- COMMON ISSUES section: self-referrals, 0% bounce rate, PII in URLs. GTM instaleld in correct place
- GTM Installation and container ID
- View Settings - exclude bots