library(ggplot2)
library(wmf)
library(ortiz)
library(plyr)
library(scales)

# Read in the data, limiting it to non-checkins past the point at which
# session IDs became reliable. Reformat the timestamps and generate dates.
data <- mysql_read(
  "SELECT event_searchSessionId, timestamp FROM TestSearchSatisfaction2_13223897 WHERE event_action IN('searchResultPage','visitPage') AND timestamp >= '20150914180000'",
  "log"
)
data$timestamp <- wmf::from_mediawiki(data$timestamp)
data$date <- as.Date(data$timestamp)

# What thresholds do we want? 1, 10, 20, 30, 60, 90, 100 and 120 seconds.
thresholds <- c(1, 10, 20, 30, 60, 90, 100, 120)

# Generate one outcome value per day for a single threshold.
#
# Arguments:
#   threshold  value handed to dwell_time() as `dwell_threshold`.
#   dataset    data frame with `event_searchSessionId`, `timestamp` and
#              `date` columns.
#
# Returns a data frame with the columns `date`, `threshold` and `outcome`,
# one row per distinct date in `dataset`. `outcome` is
# sum(dwell_outcome) / length(dwell_outcome), i.e. the average of whatever
# dwell_time() returns for that day.
# NOTE(review): presumably dwell_time() returns one success flag per session,
# making `outcome` a success proportion -- confirm against the ortiz docs.
generate_outcomes <- function(threshold, dataset) {
  events_by_day <- split(dataset, dataset$date)

  daily_results <- lapply(events_by_day, function(day_events) {
    dwell_outcome <- dwell_time(
      day_events,
      ids = "event_searchSessionId",
      timestamps = "timestamp",
      dwell_threshold = threshold
    )
    data.frame(
      date = unique(day_events$date),
      threshold = threshold,
      outcome = sum(dwell_outcome) / length(dwell_outcome)
    )
  })

  do.call("rbind", daily_results)
}

# Run that over our existing thresholds
generated_outcomes <- do.call(
  "rbind",
  lapply(thresholds, generate_outcomes, dataset = data)
)

# Generate averages for each threshold. The anonymous function returns an
# unnamed value, so the averaged column is referred to as `V1` below.
thresh_avgs <- ddply(generated_outcomes, "threshold", function(x) {
  mean(x$outcome)
})

# Plot them
average_plot <- ggplot(thresh_avgs, aes(threshold, V1)) +
  geom_line(colour = "#00BFC4", size = 1.5) +
  geom_point(size = 3) +
  theme_fivethirtynine() +
  scale_x_continuous(limits = c(0, 120)) +
  scale_y_continuous(labels = percent, limits = c(0, 0.25)) +
  labs(
    title = "User Satisfaction using Arbitrary Thresholds, by Threshold",
    x = "Threshold",
    y = "Satisfaction Rate (%)"
  )
ggsave(filename = "average_success_per_threshold.png", plot = average_plot)

# Plot the non-averaged data, by time. One line per threshold; the y axis is
# formatted as a percentage to agree with its "(%)" label.
daily_plot <- ggplot(
  generated_outcomes,
  aes(date, outcome, group = factor(threshold), colour = factor(threshold))
) +
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  theme_fivethirtynine() +
  scale_x_date(breaks = "day") +
  scale_y_continuous(labels = percent) +
  labs(
    title = "User Satisfaction using Arbitrary Thresholds, by Threshold/Day",
    x = "Date",
    y = "Satisfaction Rate (%)",
    colour = "Threshold (seconds)"
  )
ggsave(filename = "threshold_success_by_day.png", plot = daily_plot)

# Save the daily threshold successes
write.table(
  generated_outcomes,
  file = "threshold_results.tsv",
  sep = "\t",
  row.names = FALSE
)