library(webtools)
library(data.table)
library(pageviews)
library(parallel)
library(lubridate)
library(rgeoip)

# Log files to process, as reported by get_files().
files <- get_files()

#' Aggregate one sampled log file into hourly pageview counts.
#'
#' Reads the file, keeps the rows that to_pageviews() retains, truncates
#' timestamps to the hour, geolocates the client IPs and tags each request
#' (referer, project, spider/automata flags, zero flag, access method) before
#' counting rows per combination of those attributes.
#'
#' @param filename Path of a log file readable by read_sampled_log().
#' @param city_db Path to the GeoIP2 City database used for country lookups.
#' @param connection_db Path to the GeoIP2 Connection-Type database.
#' @return A data.table with one row per (timestamp, country_name, referer,
#'   project, access_method, is_spider, is_automata, is_zero) combination,
#'   carrying `pageviews` (unscaled row count) and `country_iso` (first value
#'   seen in the group).
per_file_function <- function(
    filename,
    city_db = "/usr/local/share/GeoIP/GeoIP2-City.mmdb",
    connection_db = "/usr/local/share/GeoIP/GeoIP2-Connection-Type.mmdb") {
  data <- to_pageviews(read_sampled_log(filename))

  # Drop rows whose timestamp could not be converted, then truncate to the hour.
  data$timestamp <- convert_timestamps(data$timestamp)
  data <- data[!is.na(data$timestamp), ]
  minute(data$timestamp) <- 0
  second(data$timestamp) <- 0
  # Use an explicit format: as.character() on date-times can render midnight
  # values as a bare date (version dependent), which would yield inconsistent
  # grouping keys and inconsistent strings in the database.
  data$timestamp <- format(data$timestamp, "%Y-%m-%d %H:%M:%S")

  # Geolocation of the sanitised client IPs.
  ips <- sanitise_ips(data$ip_address, data$x_forwarded)
  data <- cbind(
    data,
    geolookup(ips, city_db, c("country_name", "country_iso"))
  )
  # NOTE(review): connection_type is not part of the aggregation below, so it
  # is discarded; kept to preserve the original processing steps.
  data <- cbind(data, geolookup(ips, connection_db, c("connection_type")))

  # Per-request classification.
  data$referer <- extract_referers(iconv(data$referer, to = "UTF-8"))
  data$project <- extract_project(data$url)
  data$is_spider <- is_spider(data$user_agent)
  data$is_zero <- grepl(x = data$x_analytics, pattern = "zero", fixed = TRUE)
  data$is_automata <- is_automata(data$user_agent)
  data$access_method <- identify_access_method(data$url)

  data <- data.table(data)
  data <- data[
    ,
    list(pageviews = .N, country_iso = country_iso[1]),
    by = c(
      "timestamp", "country_name", "referer", "project",
      "access_method", "is_spider", "is_automata", "is_zero"
    )
  ]

  cat(".") # progress marker, one dot per processed file
  data
}

# Multiplier applied once to turn sampled counts into estimated totals.
# NOTE(review): presumably the logs are 1:1000 sampled -- confirm.
sample_scale <- 1000

# Grouping keys shared by the hourly and the monthly roll-ups.
rollup_keys <- c(
  "timestamp", "country_name", "country_iso", "referer", "project",
  "access_method", "is_spider", "is_automata", "is_zero"
)

# head() instead of files[1:50]: indexing past the end would pad with NA
# filenames when fewer than 50 files are available.
files_to_process <- head(files, 50L)
if (length(files_to_process) == 0L) {
  stop("get_files() returned no log files to process", call. = FALSE)
}

per_file_results <- mclapply(files_to_process, per_file_function, mc.cores = 4)

# mclapply() hands back "try-error" objects for workers that failed; surface
# them explicitly rather than letting the row-bind fail obscurely.
failed <- vapply(per_file_results, inherits, logical(1), what = "try-error")
if (any(failed)) {
  stop(
    sum(failed), " of ", length(failed), " log files failed to process; ",
    "first error: ", as.character(per_file_results[[which(failed)[1]]]),
    call. = FALSE
  )
}
results <- rbindlist(per_file_results, use.names = TRUE)

# Hourly roll-up across files: strip everything from ".org" onwards in the
# project name and scale the sampled counts.
results$project <- gsub(
  x = results$project, pattern = "\\.org.*", replacement = ""
)
results <- results[
  ,
  list(pageviews = sum(pageviews) * sample_scale),
  by = rollup_keys
]
mysql_write(results, "pageviews05", "staging")

# Monthly roll-up: collapse every timestamp to the first day of its month.
results$timestamp <- as.Date(as.POSIXlt(results$timestamp))
day(results$timestamp) <- 1
# `pageviews` was already scaled by the hourly roll-up above, so it is only
# summed here; multiplying again would inflate the monthly table 1000-fold.
results <- results[
  ,
  list(pageviews = sum(pageviews)),
  by = rollup_keys
]
mysql_write(results, "pentahoviews05", "staging")