library(wmf)
library(ggplot2)
library(data.table)
library(uaparser)

# Get data ----
# The query text is assembled from pieces purely for line length; paste()
# joins them with single spaces. The second argument is handed to
# wmf::mysql_read() as-is (presumably the database name -- TODO confirm).
data <- wmf::mysql_read(
  paste(
    "SELECT timestamp, userAgent, wiki,",
    "event_searchSessionId AS session_id,",
    "event_subTest AS subtest",
    "FROM TestSearchSatisfaction2_14098806"
  ),
  "log"
)
data <- as.data.table(data)

# Flag events whose subtest label contains "enabled". grepl() returns FALSE
# (not NA) for missing labels, so events outside the test are flagged FALSE.
# This must run before the suffix is stripped from `subtest` below.
data[, terms_enabled := grepl(pattern = "enabled", x = subtest)]

# Drop the trailing ":enabled" / ":disabled" suffix so `subtest` only holds
# the remaining part of the label.
data[, subtest := gsub(
  x = subtest,
  pattern = "\\:(dis|en)abled$",
  replacement = ""
)]

# Check the sampling rate ----
# An event counts as "in the test" when it carries a non-missing subtest.
in_test <- !is.na(data$subtest)

# Share of distinct sessions that have at least one in-test event.
# NOTE(review): computed but never written to disk in this section.
sampling_rate_session <- length(unique(data$session_id[in_test])) /
  length(unique(data$session_id))

# Share of all events that are in-test events.
sampling_rate_events <- sum(in_test) / nrow(data)

# Event counts per subtest (missing subtest forms its own group).
# NOTE(review): computed but never written to disk in this section; confirm
# whether this was meant to be part of relaxer_sampling_rate.tsv.
per_group_sampling_rate <- data[, list(event_count = .N), by = "subtest"]

# The outputs are named *.tsv, so write them tab-separated (write.table()
# defaults to a single space otherwise).
write.table(
  sampling_rate_events,
  file = "relaxer_sampling_rate.tsv",
  sep = "\t",
  row.names = FALSE
)

# Check the user agents ----
# Tabulate "<browser> <major version>" strings and sort by frequency.
ua_data <- parse_agents(data$userAgent)
ua_data <- as.data.frame(
  table(paste(ua_data$browser, ua_data$browser_major)),
  stringsAsFactors = FALSE
)
ua_data <- ua_data[order(ua_data$Freq, decreasing = TRUE), ]

# head() keeps at most 20 rows; indexing with 1:20 would pad the output with
# all-NA rows whenever fewer than 20 distinct browser/version pairs exist.
write.table(
  head(ua_data, 20),
  file = "relaxer_ua_data.tsv",
  sep = "\t",
  row.names = FALSE
)

# Check the projects ----
# In-test event counts per wiki and subtest.
project_data <- data[
  !is.na(subtest),
  list(events = .N),
  by = c("wiki", "subtest")
]
write.table(
  project_data,
  file = "relaxer_project_data.tsv",
  sep = "\t",
  row.names = FALSE
)