```{r setup, echo=FALSE, warning=FALSE, message=FALSE} knitr::opts_chunk$set( echo = F, collapse = TRUE, message = FALSE, warning = FALSE, out.width = "70%", fig.align = 'center', fig.width = 7, fig.asp = 0.618, # 1 / phi fig.show = "hold" ) library(assertthat) library(tidyverse) library(rvest) library(lubridate) library(stringr) library(rmarkdown) library(dygraphs) library(xts) # Stored by default in C:/Users/}/.R/gargle/gargle-oauth options( gargle_oauth_cache = "/config", googleAuthR.scopes.selected = "https://www.googleapis.com/auth/analytics.readonly" # Readonly scope is required for certain management API calls. See https://developers.google.com/analytics/devguides/config/mgmt/v3/authorization#OAuth2Authorizing ) library(googleAuthR) # Service Account Credentials # gar_auth_service(json_file=Sys.getenv("SERVICE_JSON")) library(googleAnalyticsR) # Alternatively, an OAuth client and email can be provided gar_set_client(json = Sys.getenv("CLIENT_JSON")) ga_auth(email=Sys.getenv("CLIENT_EMAIL")) theme_set(theme_minimal() + theme(text=element_text(size=16))) # Select an account ID and view ID to begin account_id <- Sys.getenv("ACCOUNT_ID") view_id <- Sys.getenv("VIEW_ID") ``` ```{r property and view data, message=FALSE, warning=FALSE} accounts <- ga_accounts() cur_account <- accounts %>% filter(id == account_id) property_df <- ga_webproperty_list(account_id) views_df <- bind_rows(lapply(property_df$id, function(x){ga_view_list(accountId = account_id, webPropertyId = x)})) %>% left_join(property_df,by=c("webPropertyId"="id")) # View-specific view_name <- (views_df %>% filter(id == view_id))$name.x view_url <- (views_df %>% filter(id == view_id))$websiteUrl.x property_url <- (views_df %>% filter(id == view_id))$websiteUrl.y property_name <- (views_df %>% filter(id == view_id))$name.y property_id <- (views_df %>% filter(id == view_id))$webPropertyId custom_dimensions_df <- ga_custom_vars_list(accountId = account_id, webPropertyId = property_id) goals_df <- ga_goal_list(webPropertyId = property_id, profileId = view_id, accountId=account_id) view_filters_df <- ga_filter_view_list(webPropertyId = property_id, viewId = view_id,accountId = account_id) if(nrow(view_filters_df) > 0){ view_filters_df <- view_filters_df %>% select(filterRef.id, profileRef.id, profileRef.name, rank) %>% left_join(ga_filter_list(account_id), by=c("filterRef.id" = "id")) } ``` # Account Structure ## Properties The account, __`r cur_account$name`__, has `r nrow(property_df)` properties. These are shown below sorted by update date in reverse chronological order. ```{r rows.print=5,cols.min.print=3} property_df %>% arrange(desc(updated)) %>% select(Property=name, URL=websiteUrl, `Updated Date`=updated) ``` ## Views ```{r} property_with_most_views <- views_df %>% group_by(`name.y`) %>% tally() %>% arrange(desc(n)) %>% ungroup() %>% filter(row_number() == 1) ``` In total, there are `r nrow(views_df)` views. These are shown below in reverse chronological order by update date. ```{r rows.print=5,cols.min.print=3} views_df %>% arrange(desc(updated.x)) %>% select(View=name.x, Property=name.y, `Update Date`=updated.x) ``` TODO - only need to show this if there is more than 1 property The property with the most views is __`r property_with_most_views$name.y`__ with `r property_with_most_views$n` views. The number of views per property is below. ```{r fig.height = 4} views_df %>% group_by(webPropertyId, name.y) %>% tally() %>% select(`Property ID`=webPropertyId,Name=name.y,`View Count`=n) %>% arrange(desc(`View Count`)) ``` # Configuration ## Custom Dimensions ```{r} no_custom_dimensions <- nrow(custom_dimensions_df) == 0 ```
The view, __`r view_name`__, has no associated custom dimensions
Property __`r property_name`__ has `r nrow(custom_dimensions_df)` custom dimensions shown below: ```{r} if(!no_custom_dimensions) { custom_dimensions_df %>% select(ID = index, Name = name, Scope = scope, `Is Active?`=active) } ```
```{r} # Check to see if there are any goals and change display of markdown accordingly no_goals <- nrow(goals_df) == 0 no_destination_goals <- F no_event_goals <- F if(!no_goals) { no_destination_goals <- nrow(goals_df %>% filter(type == "URL_DESTINATION")) == 0 no_event_goals <- nrow(goals_df %>% filter(type == "EVENT")) == 0 } ``` ## Goals {.tabset .tabset-fade .tabset-pills} ### All Goals (`r nrow(goals_df)`) View __`r view_name`__ has `r nrow(goals_df)` goals listed below. ```{r rows.print=5} if(!no_goals) { goals_df %>% select(Goal = name, Type = type, `Is Active?`=active) } ``` ### Destination Goals (`r if(nrow(goals_df) != 0){nrow(goals_df %>% filter(type == "URL_DESTINATION"))}else{ integer(1)}`) ```{r} if(!no_goals && !no_destination_goals) { goals_df %>% filter(type == "URL_DESTINATION") %>% select(Goal = name,URL = urlDestinationDetails.url) } ``` ### Event Goals (`r if(nrow(goals_df) != 0){nrow(goals_df %>% filter(type == "EVENT"))}else{integer(1)}`) ```{r} if(!no_goals && !no_event_goals) { goals_df %>% filter(type == "EVENT") %>% select(Goal = name,`Event Details` = eventDetails.eventConditions) } ``` ```{r} # Check to see if there are any goals and change display of markdown accordingly no_filters <- nrow(view_filters_df) == 0 no_include_exclude_filters <- T if(!no_filters) { no_include_exclude_filters <- nrow(view_filters_df %>% filter(type == "INCLUDE" | type == "EXCLUDE")) == 0 } ``` ## Filters {.tabset .tabset-fade .tabset-pills} ### All Filters (`r nrow(view_filters_df)`) View __`r view_name`__ has `r nrow(view_filters_df)` filters listed below. ```{r rows.print=7} if(!no_filters) { view_filters_df %>% select(Filter = name, Type = type) } ``` ### Include Filters (`r if(!no_filters){nrow(view_filters_df %>% filter(type == "INCLUDE"))}else{integer(1)}`) Include/Exclude filters are called out below along with their criteria. ```{r rows.print=7} if(!no_filters && nrow(view_filters_df %>% filter(type == "INCLUDE")) > 0) { view_filters_df %>% filter(type=="INCLUDE") %>% mutate(Criteria = if_else(type == "INCLUDE",paste(includeDetails.field,includeDetails.matchType,includeDetails.expressionValue),paste(excludeDetails.field,excludeDetails.matchType,excludeDetails.expressionValue))) %>% select(Filter = name, Type = type, Criteria) } ``` ### Exclude Filters (`r if(!no_filters){nrow(view_filters_df %>% filter(type == "EXCLUDE"))}else{integer(1)}`) Include/Exclude filters are called out below along with their criteria. ```{r rows.print=7} if(!no_filters && nrow(view_filters_df %>% filter(type == "EXCLUDE")) > 0) { view_filters_df %>% filter(type=="EXCLUDE") %>% mutate(Criteria = if_else(type == "INCLUDE",paste(includeDetails.field,includeDetails.matchType,includeDetails.expressionValue),paste(excludeDetails.field,excludeDetails.matchType,excludeDetails.expressionValue))) %>% select(Filter = name, Type = type, Criteria) } ``` # Data Integrity ## Goal Measurement {.tabset .tabset-fade .tabset-pills} ```{r message=FALSE,warning=FALSE} if(!no_goals) { # Capture goal IDs to review ga_goal_metric_names <- paste0("goal",goals_df$id,"Completions") ga_goal_names <- goals_df$name goal_lookup <- data.frame(list(Goal=ga_goal_names,Metric_Name=ga_goal_metric_names)) date_range_7 <- c(Sys.Date() - 8,Sys.Date() - 1) date_range_30 <- c(Sys.Date() - 31,Sys.Date() - 1) date_range_90 <- c(Sys.Date() - 91,Sys.Date() - 1) get_goal_data <- function(date_range){ google_analytics(viewId = view_id, date_range = date_range, metrics = ga_goal_metric_names, dimensions = c("channelGrouping")) %>% select(-channelGrouping) %>% pivot_longer(cols=everything(),names_to="Goal",values_to = "Completions") %>% group_by(Goal) %>% summarise(Completions = sum(Completions)) %>% left_join(goal_lookup,by=c("Goal"="Metric_Name")) %>% select(Goal = Goal.y,Completions) } ga_goal_data_7 <- get_goal_data(date_range_7) ga_goal_data_30 <- get_goal_data(date_range_30) ga_goal_data_90 <- get_goal_data(date_range_90) } ``` ### Past 7 Days { `r if_else(no_goals,".hide","")`} ```{r} if(!no_goals) { ga_goal_data_7 } ``` ### Past 30 Days { `r if_else(no_goals,".hide","")`} ```{r} if(!no_goals) { ga_goal_data_30 } ``` ### Past 90 Days { `r if_else(no_goals,".hide","")`} ```{r} if(!no_goals) { ga_goal_data_90 } ``` ## Hostnames ```{r message=FALSE,warning=FALSE} ga_hostnames <- google_analytics(viewId = view_id, date_range = c(Sys.Date()-31,Sys.Date()-1),metrics = "sessions",dimensions = c("hostname"),max=-1) ``` In many cases, GA properties and views are capturing data from the wrong domain or sub-domain. As a reference, the URL associated with __`r view_name`__ is `r view_url` and the URL associated with its property is `r property_url`. These URLs are configured in the GA admin and don't necessarily indicate which domains should be tracked. Data has been retrieved for the last 30 days (`r paste0(Sys.Date()-31," to ",Sys.Date()-1)`). Overall, `r nrow(ga_hostnames)` unique hostnames were discovered. ```{r} ga_hostnames ``` ```{r message=FALSE,warning=FALSE} ga_events <- google_analytics(viewId = view_id, date_range = c(Sys.Date()-31,Sys.Date()-1),metrics = "totalEvents",dimensions = c("eventCategory","eventAction","eventLabel"),max=-1) ``` ## Events {.tabset .tabset-fade .tabset-pills `r if_else(nrow(ga_events) == 0,".hide","")`} ### Top Events ```{r} ga_events %>% arrange(desc(totalEvents)) %>% select(Category = eventCategory, Action = eventAction, Label = eventLabel, Events = totalEvents) %>% head(100) ``` ### Unique Event Categories ```{r} ga_events %>% select(`Event Category` = eventCategory, Total = totalEvents) %>% group_by(`Event Category`) %>% summarise(Total = sum(Total)) %>% arrange(desc(Total)) ``` ### Unique Event Actions ```{r} ga_events %>% select(`Event Action` = eventAction, Total = totalEvents) %>% group_by(`Event Action`) %>% summarise(Total = sum(Total)) %>% arrange(desc(Total)) ``` ### Unique Event Labels ```{r} ga_events %>% select(`Event Label` = eventLabel, Total = totalEvents) %>% group_by(`Event Label`) %>% summarise(Total = sum(Total)) %>% arrange(desc(Total)) ``` ## Source / Medium / Campaign {.tabset .tabset-fade .tabset-pills} ### Sources ```{r} ga_sources <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("source")) %>% arrange(desc(sessions)) ga_sources ``` ### Mediums ```{r} ga_mediums <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("medium")) %>% arrange(desc(sessions)) ga_mediums ``` ### Campaigns ```{r} ga_campaigns <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("campaign")) %>% arrange(desc(sessions)) ga_campaigns ``` # Trends ## All Traffic ```{r fig.width = 8, fig.asp = .3} ga_all_traffic_current <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-31,Sys.Date()-1),metrics="sessions",dimensions = c("date")) %>% rename(`Sessions - Last 30 Days`=sessions) %>% mutate(row = row_number()) ga_all_traffic_mom <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-62,Sys.Date()-32),metrics="sessions",dimensions = c("date")) %>% rename(`Sessions - Previous Period`=sessions) %>% mutate(row = row_number()) ga_all_traffic_yoy <- google_analytics(viewId=view_id,date_range=c(Sys.Date()-365-30,Sys.Date()-365-1),metrics="sessions",dimensions = c("date")) %>% rename(`Sessions - Previous Year`=sessions) %>% mutate(row = row_number()) ga_all_traffic <- ga_all_traffic_current %>% left_join(ga_all_traffic_mom %>% select(-date)) %>% left_join(ga_all_traffic_yoy %>% select(-date)) %>% select(-row) %>% pivot_longer(cols=-date,names_to = "Period",values_to="Sessions") ga_all_traffic_wide <- ga_all_traffic %>% pivot_wider(id_cols = c(date), names_from = Period, values_from = Sessions) ga_sessions_last_30_xts <- xts(ga_all_traffic_wide %>% select(2), order.by = ga_all_traffic_wide$date) ga_sessions_prev_period_xts <- xts(ga_all_traffic_wide %>% select(3), order.by = ga_all_traffic_wide$date) ga_sessions_prev_year_xts <- xts(ga_all_traffic_wide %>% select(4), order.by = ga_all_traffic_wide$date) ga_sessions_xts <- cbind(ga_sessions_last_30_xts,ga_sessions_prev_period_xts,ga_sessions_prev_year_xts) dygraph(ga_sessions_xts) %>% dyRangeSelector() %>% # Optionally, we can include an interactive date range selector dyOptions(fillGraph = TRUE, fillAlpha = 0.15, gridLineColor = "#DDDDDD", axisLabelFontSize = 10) %>% dySeries("Sessions...Last.30.Days", label = "Sessions", drawPoints = T, pointSize = 2, strokeWidth=2) %>% dySeries("Sessions...Previous.Period", label = "Previous Period", drawPoints = T, pointSize = 2, strokeWidth=2) %>% dySeries("Sessions...Previous.Year", label = "Previous Year", drawPoints = T, pointSize = 2, strokeWidth=2) %>% dyAxis(name = "y", axisLineColor = "white", label = "Sessions") %>% dyAxis(name = "x", drawGrid = F) %>% dyLegend(show = "always", hideOnMouseOut =F, width = 600) ``` ## Landing Pages Showing top 100 landing pages by number of sessions. ```{r message=FALSE,warning=FALSE} ga_landing_pages <- google_analytics(viewId=view_id, date_range=c(Sys.Date()-31,Sys.Date()-1),metrics=c("sessions","bounceRate","avgSessionDuration"), dimensions=c("landingPagePath"),max=100) %>% arrange(desc(sessions)) %>% rename(`Landing Page` = landingPagePath,`Bounce Rate` = bounceRate, `Avg Session Duration` = avgSessionDuration, Sessions = sessions) ga_landing_pages ``` ## Top Referral Sources ```{r message=FALSE,warning=FALSE} filter_clause <- filter_clause_ga4(list(dim_filter(dimension = "channelGrouping", operator = "EXACT", expressions = "Referral"))) ga_referrers <- google_analytics(viewId=view_id, date_range=c(Sys.Date()-31,Sys.Date()-1),metrics=c("sessions","bounceRate","avgSessionDuration"), dimensions=c("sourceMedium"),dim_filters = filter_clause,max=100) %>% rename(`source / medium` = sourceMedium) if(!is.null(ga_referrers)) { ga_referrers %>% arrange(desc(sessions)) } ``` # Common Issues ## Hard-Coded Analytics Scripts Looking for hard-coded analytics scripts in the HTML of the website pages associated with each property. ```{r message=FALSE} lapply(property_df$websiteUrl,function(x){ read_html(x) %>% html_nodes("script") %>% xml2::xml_find_all(".//text()") %>% as.character() %>% enframe() %>% mutate(websiteUrl = x) %>% mutate(`gtm.js`=str_detect(value, "gtm.js")) %>% mutate(`analytics.js`=str_detect(value, "/analytics.js")) %>% mutate(`gtag.js`=str_detect(value, "gtag.js")) }) %>% bind_rows() %>% group_by(websiteUrl) %>% summarise(`gtm.js`=any(`gtm.js`),`analytics.js`=any(`analytics.js`),`gtag.js`=any(`gtag.js`)) %>% left_join(property_df %>% select(websiteUrl,id)) %>% select(`Property ID`=id, URL=websiteUrl,`gtm.js`,`analytics.js`,`gtag.js`) ``` ## Email Address in Page Paths ```{r message=FALSE} pii_clause <- filter_clause_ga4(list(dim_filter(dimension = "pagePath", operator = "REGEXP", expressions = "\\?.+email="))) ga <- google_analytics(viewId=view_id, date_range=c(Sys.Date()-91,Sys.Date()-1),metrics=c("pageViews"), dimensions=c("pagePath"),dim_filters = pii_clause,max=-1) if(is.null(ga))ga <- data.frame() ``` Found `r nrow(ga)` page paths with 'email' in the query string in the last 90 days. ```{r} if(nrow(ga) > 0) { ga %>% select(pagePath) } ``` # TODO List - Campaign Trends - Anomalies - Gaps in data, start/end date of data collection - How to handle key pages or flows to track, like login? - How to handle form completion? - E-commerce considerations - COMMON ISSUES section: self-referrals, 0% bounce rate, PII in URLs. GTM instaleld in correct place - GTM Installation and container ID - View Settings - exclude bots