Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
0. New Editor Defition
- Having made more than 10 edits, of course,
- but also depending upon the following constraints defined in respece to the
wmf.mediawiki_history
Hive table (conjunction): – event_type = 'create'
– event_user_is_created_by_self = true
– event_user_is_bot_by_name = false
– page_namespace = 0
– page_is_redirect_latest = false
– !(event_user_id = 0)
The code chunk in 1. Data Acquistion
encompasses the HiveQL
query used to fetch the data from wmf.mediawiki_history
. At this point, this Hive table does not encompass the historical page_is_redirect
, introducing the most severe problem in the current implementation of this Report. It is expected that this field will become available during Q4/2017
T161146
1. Data Acquistion
The Data Acquisition code chunk is not reproducible from this report. It is run as an newEds_mediawiki_history.R
script in production (currently stat1005
). The userIds
are anonymized and the dataset is copied mannually from production and processed locally to produce this Report.
### --- Script: newEds_mediawiki_history.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/_miscWMDE/newEditors_MediaWikiHistory/newEds_mediawiki_history.R
### --- The script collects (a) userIds, and (b) dates (yyyymmdd) on which a particular user
### --- has reached >= 10 edits on a given project.
### --- The datasets are used for the New Editors Report on `dewiki`
### --- Goran S. Milovanovic, Data Analyst, WMDE
### --- October 16, 2017.
rm(list = ls())
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
### --- Define snapshot for wmf.mediawiki_history
snapshot <- as.character(Sys.time())
snapshotY <- strsplit(snapshot, split = "-")[[1]][1]
snapshotM <- strsplit(snapshot, split = "-")[[1]][2]
snapshot <- paste(snapshotY, snapshotM, sep = "-")
### --- NOTE DECEMBER SNAPSHOT NOT READY (comment: 01/04/2018)
snapshot <- '2017-11'
### --- END define snapshot
### --- projects list
projects <- c('dewiki', 'enwiki', 'frwiki')
### --- dir struct:
baseDir <- '/srv/home/goransm/_miscWMDE/newEditors_MediaWikiHistory'
outDir <- paste(baseDir, '/_results/', sep = "")
scriptDir <- paste(baseDir, '/_script/', sep = "")
setwd(scriptDir)
### --- run HiveQL scripts
for (i in 1:length(projects)) {
hiveQL <- paste("SELECT event_user_id, SUBSTR(from_utc_timestamp(event_timestamp, 'CET'), 1, 10)
FROM (
SELECT *,
row_number() OVER (partition by event_user_id ORDER by event_timestamp) rownum
FROM wmf.mediawiki_history WHERE wiki_db = '", projects[i],
"' AND event_entity = 'revision'
AND event_type = 'create'
AND event_user_is_created_by_self = true
AND event_user_is_bot_by_name = false
AND page_namespace = 0
AND page_is_redirect = false
AND !(event_user_id = 0)
AND snapshot = '", snapshot,
"') tab1
WHERE rownum = 10;",
sep = "")
# - write hql
write(hiveQL, 'newEds10.hql')
### --- output filename
filename <- paste('newUsers10_', projects[i],".tsv", sep = "")
filename <- paste(outDir, filename, sep = "")
### --- execute hql script:
hiveArgs <- 'beeline -f'
hiveInput <- paste(paste(scriptDir, 'newEds10.hql', sep = ""),
" > ",
filename,
sep = "")
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
### --- END run HiveQL scripts
### --- anonymize, wrangle, and save
setwd(outDir)
Sys.setlocale("LC_TIME", "C")
lF <- list.files()
lF <- lF[grepl("tsv", lF, fixed = T)]
for (i in 1:length(lF)) {
project <- strsplit(
strsplit(lF[i], split = "_", fixed = T)[[1]][2],
split = ".", fixed = T)[[1]][1]
dataSet <- readLines(lF[i])
dataSet <- dataSet[16:(length(dataSet) - 2)]
User <- sapply(dataSet, function(x) {
strsplit(x, split = "\t", fixed = T)[[1]][1]
})
Date <- sapply(dataSet, function(x) {
strsplit(x, split = "\t", fixed = T)[[1]][2]
})
dsFrame <- data.frame(User = User,
Date = Date,
stringsAsFactors = F,
row.names = seq(1, length(User), by = 1))
rm(Date); rm(User); rm(dataSet); gc()
dsFrame$Date <- as.Date(dsFrame$Date)
dsFrame <- dsFrame %>%
arrange(Date)
# - anonymize user Ids
dsFrame$User <- paste("u_", seq(1, length(dsFrame$User), by = 1))
# - produce dataset w. daily resoluton
filename <- paste("NewUsersDaily_", project, ".csv", sep = "")
dailyRes <- dsFrame %>%
group_by(Date) %>%
summarise(Count = n())
dailyRes$Month <- sapply(months(dailyRes$Date), function(x) {
which(month.name %in% x)
})
dailyRes$Year <- year(dailyRes$Date)
dailyRes$Week <- week(dailyRes$Date)
dailyRes$DayWeek <- weekdays(dailyRes$Date)
write.csv(dailyRes, filename)
}
2. Weekly Trends (Big Picture)
### --- Define projects under consideration:
# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
[1] "Current update: 2018-01-08 20:09:12"
projects <- c('dewiki', 'enwiki', 'frwiki')
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Recent weekly trends:
for (i in 1:length(projects)) {
w <- which(grepl(projects[i], lF, fixed = T))
dataSet <- read.csv(paste('./_results/', lF[w], sep = ""),
row.names = 1,
check.names = F,
header = T,
stringsAsFactors = F)
dWeekly <- dataSet
dWeekly$Month <- sapply(dWeekly$Month, function(x) {
if (!(nchar(x) == 2)) {
return(paste("0", x, sep = ""))
} else {
return(x)
}
})
dWeekly$Week <- sapply(dWeekly$Week, function(x) {
if (!(nchar(x) == 2)) {
return(paste("0", x, sep = ""))
} else {
return(x)
}
})
dWeekly$YW <- paste(dWeekly$Year, dWeekly$Week, sep = "-")
dWeekly <- dWeekly %>%
group_by(YW) %>%
summarise(Count = sum(Count)) %>%
arrange(YW)
dwSets[[i]] <- dWeekly
rm(dWeekly); rm(dataSet); gc()
}
### --- produce plotSet
dweeks <- unlist(lapply(dwSets, function(x) {
x$YW
}))
dMat <- as.data.frame(matrix('', nrow = length(dweeks), ncol = length(projects) + 1),
stringsAsFactors = F)
colnames(dMat)[1] <- 'Week'
colnames(dMat)[2:dim(dMat)[2]] <- projects
dMat[, 1] <- dweeks
for (i in 1:length(dwSets)) {
w <- which(dMat$Week %in% dwSets[[i]]$YW)
dMat[, i+1][w] <- dwSets[[i]]$Count
}
dMat <- dMat %>%
gather(key = Project,
value = Count,
projects)
dMat$Count <- as.numeric(dMat$Count)
# - x-axis labels
dMat$XLabs <- sapply(dMat$Week, function(x) {
if (grepl("-01$", x)) {
return(strsplit(x, split = "-", fixed = T)[[1]][1])
} else {
return("")
}
})
# - Visualize w. {ggplot2}
ggplot(dMat, aes(x = Week,
y = Count,
group = Project,
color = Project,
fill = Project)) +
geom_line(size = .25) +
scale_y_continuous(labels = comma) +
scale_x_discrete(labels = dMat$XLabs) +
xlab("Year (weekly data resolution)") +
ylab("New editors") +
ggtitle('New Editors (>= 10 edits) Income:\nWeekly comparison (starting at: Week 16. of 2016.)') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank())
Log-scale:
# - Visualize w. {ggplot2}
ggplot(dMat, aes(x = Week,
y = log(Count),
group = Project,
color = Project,
fill = Project)) +
geom_line(size = .25) +
scale_y_continuous(labels = comma) +
scale_x_discrete(labels = dMat$XLabs) +
xlab("Year (weekly data resolution)") +
ylab("log(New editors)") +
ggtitle('New Editors (>= 10 edits) Income:\nWeekly comparison (starting at: Week 16. of 2016.)') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank())
3. Monthly Trends: The Previous Six Months
### --- Define projects under consideration/get files
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Monthly trends:
for (i in 1:length(lF)) {
project <- strsplit(
strsplit(lF[i], split = "_", fixed = T)[[1]][2],
split = ".", fixed = T)[[1]][1]
dS <- read.csv(paste('./_results/', lF[i], sep = ""),
header = T,
check.names = F,
stringsAsFactors = F,
row.names = 1)
dS$YearMonth <- paste(dS$Year,
ifelse(nchar(dS$Month) == 2,
dS$Month,
paste("0", dS$Month, sep = "")
),
sep = "-")
dS <- dplyr::select(dS, YearMonth, Count)
dS$Project <- project
dwSets[[i]] <- dS
rm(dS)
}
dwSets <- rbindlist(dwSets)
dwSets <- dwSets %>%
group_by(Project, YearMonth) %>%
summarise(Editors = sum(Count)) %>%
arrange(Project, YearMonth)
### --- determine: last 6 months
actualDate <- as.character(Sys.time())
actualDateY <- as.numeric(strsplit(actualDate, split = "-")[[1]][1])
actualDateM <- as.numeric(strsplit(actualDate, split = "-")[[1]][2])
actualDateM <- actualDateM - 1
actualDateM1 <- actualDateM - 5
monthsSeq <- actualDateM1:actualDateM
yearsSeq <- rep(actualDateY, length(monthsSeq))
yearsSeq[which(monthsSeq <= 0)] <- actualDateY - 1
monthsSeq[which(monthsSeq<= 0)] <- 12 - abs(monthsSeq[which(monthsSeq<= 0)])
targetYM <- paste(yearsSeq,
ifelse(nchar(monthsSeq) == 2,
monthsSeq,
paste("0", monthsSeq, sep = "")
),
sep = "-")
### --- filter: last 6 months
dwSets <- dwSets %>%
filter(YearMonth %in% targetYM)
### --- visualize w. {ggplot2}
colnames(dwSets)[2] <- 'Month'
dwSets$Month <- factor(dwSets$Month, levels = sort(unique(dwSets$Month)))
ggplot(dwSets,
aes(x = Month, y = Editors, label = Editors)) +
geom_line(size = .25, color = "#4c8cff", group = 1) +
geom_point(size = 1.5, color = "#4c8cff") +
geom_point(size = 1, color = "white") +
geom_text_repel(data = dwSets,
aes(x = Month, y = Editors, label = Editors),
size = 3) +
facet_wrap(~ Project, ncol = 3, scales = "free_y") +
xlab('Month') + ylab('New Editors (>= 10 edits)') +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) +
theme(axis.title.x = element_text(size = 12)) +
theme(axis.title.y = element_text(size = 12)) +
theme(plot.title = element_text(size = 15))
3. Monthly Trends: The Previous Two Years
### --- Define projects under consideration/get files
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Monthly trends:
for (i in 1:length(lF)) {
project <- strsplit(
strsplit(lF[i], split = "_", fixed = T)[[1]][2],
split = ".", fixed = T)[[1]][1]
dS <- read.csv(paste('./_results/', lF[i], sep = ""),
header = T,
check.names = F,
stringsAsFactors = F,
row.names = 1)
dS$YearMonth <- paste(dS$Year,
ifelse(nchar(dS$Month) == 2,
dS$Month,
paste("0", dS$Month, sep = "")
),
sep = "-")
dS <- dplyr::select(dS, YearMonth, Count)
dS$Project <- project
dwSets[[i]] <- dS
rm(dS)
}
dwSets <- rbindlist(dwSets)
dwSets <- dwSets %>%
group_by(Project, YearMonth) %>%
summarise(Editors = sum(Count)) %>%
arrange(Project, YearMonth)
### --- determine: last 2 years
actualDate <- as.character(Sys.time())
actualDateY <- as.numeric(strsplit(actualDate, split = "-")[[1]][1])
actualDateM <- as.numeric(strsplit(actualDate, split = "-")[[1]][2])
actualDateM <- actualDateM - 1
actualDateM1 <- actualDateM - 12
monthsSeq <- actualDateM1:actualDateM
yearsSeq <- rep(actualDateY, length(monthsSeq))
yearsSeq[which(monthsSeq <= 0)] <- actualDateY - 1
monthsSeq[which(monthsSeq <= 0)] <- 12 - abs(monthsSeq[which(monthsSeq<= 0)])
targetYM <- paste(yearsSeq,
ifelse(nchar(monthsSeq) == 2,
monthsSeq,
paste("0", monthsSeq, sep = "")
),
sep = "-")
targetYM1 <- lapply(targetYM, function(x) {
x <- strsplit(x, split = "-")[[1]]
x[1] <- as.numeric(x[1]) - 1
x <- paste(x, collapse = "-")
return(x)
})
targetYM <- unique(c(targetYM, unlist(targetYM1)))
### --- filter: last 6 months
dwSets <- dwSets %>%
filter(YearMonth %in% targetYM)
### --- visualize w. {ggplot2}
colnames(dwSets)[2] <- 'Month'
dwSets$Month <- factor(dwSets$Month, levels = sort(unique(dwSets$Month)))
ggplot(dwSets,
aes(x = Month, y = Editors, label = Editors)) +
geom_line(size = .25, color = "#4c8cff", group = 1) +
geom_point(size = 1.5, color = "#4c8cff") +
geom_point(size = 1, color = "white") +
geom_text_repel(data = dwSets,
aes(x = Month, y = Editors, label = Editors),
size = 3) +
facet_wrap(~ Project, ncol = 1, scales = "free_y") +
xlab('Month') + ylab('New Editors (>= 10 edits)') +
scale_y_continuous(labels = comma) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(axis.title.x = element_text(size = 12)) +
theme(axis.title.y = element_text(size = 12)) +
theme(plot.title = element_text(size = 15))
4. dewiki Forecast
The optimal ARIMA forecast for dewiki
, starting from the first year with a complete monthly dataset (2007):
### --- Data
dataSet <- read.csv(paste(getwd(), '/_results/NewUsersDaily_dewiki.csv', sep = ''),
header = T,
check.names = F,
stringsAsFactors = F,
row.names = 1)
### --- Wrangle
dataSet$Month <- sapply(dataSet$Month, function(x) {
if(nchar(x) == 1) {
x <- paste("0", x, sep = "")
}
x
})
dataSet <- arrange(dataSet, Year, Month)
# - complete data since 2007:
completeYears <- 2007:2017
wComplete <- rowSums(sapply(completeYears, function(x) {
grepl(x, dataSet$Year)
}))
dataSet <- dataSet[as.logical(wComplete), ]
# - summarise per month and drop incomplete data for the last month:
dataSet <- dataSet %>%
select(Year, Month, Count) %>%
group_by(Year, Month) %>%
summarise(Edits = sum(Count))
### --- as time series object:
timeEds <- ts(dataSet$Edits,
start = c(2007, 1),
end = c(2017, as.numeric(dataSet$Month[dim(dataSet)[1]])),
frequency = 12)
### --- ARIMA model
timeEdsARIMA <- auto.arima(timeEds, D = 1, seasonal = T)
timeEdsARIMA
Series: timeEds
ARIMA(0,1,1)(1,1,1)[12]
Coefficients:
ma1 sar1 sma1
-0.5099 -0.3550 -0.5699
s.e. 0.0797 0.1271 0.1223
sigma^2 estimated as 3090: log likelihood=-645.56
AIC=1299.12 AICc=1299.48 BIC=1310.21
plot(forecast(timeEdsARIMA))
---
title: "New Editors on de.wikipedia.org"
author: "Goran S. Milovanovic, Data Scientist, WMDE"
date: "January 08, 2017"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_depth: 2
    toc_float: yes
  html_document:
    toc: yes
    toc_depth: 2
---

**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

```{r, echo = F, warning = F, message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(scales)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(tseries)
library(forecast)
library(data.table)
```

## 0. New Editor Defition

- Having made more than 10 edits, of course, 
- but also depending upon the following constraints defined in respece to the
- `wmf.mediawiki_history` Hive table (conjunction):
-- `event_type = 'create'`
-- `event_user_is_created_by_self = true`
-- `event_user_is_bot_by_name = false`
-- `page_namespace = 0`
-- `page_is_redirect_latest = false`
-- `!(event_user_id = 0)`

The code chunk in `1. Data Acquistion` encompasses the `HiveQL` query used to fetch the data from `wmf.mediawiki_history`. At this point, this Hive table does not encompass the historical `page_is_redirect`, introducing the most severe problem in the current implementation of this Report. It is expected that this field will become available during `Q4/2017` [T161146](https://phabricator.wikimedia.org/T161146)

## 1. Data Acquistion

The Data Acquisition code chunk is not reproducible from this report. It is run as an `newEds_mediawiki_history.R` script in production (currently `stat1005`). The `userIds` are anonymized and the dataset is copied mannually from production and processed locally to produce this Report.

```{r, echo = T, eval = F}
### --- Script: newEds_mediawiki_history.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/_miscWMDE/newEditors_MediaWikiHistory/newEds_mediawiki_history.R

### --- The script collects (a) userIds, and (b) dates (yyyymmdd) on which a particular user
### --- has reached >= 10 edits on a given project.
### --- The datasets are used for the New Editors Report on `dewiki`

### --- Goran S. Milovanovic, Data Analyst, WMDE
### --- October 16, 2017.

rm(list = ls())
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Define snapshot for wmf.mediawiki_history
snapshot <- as.character(Sys.time())
snapshotY <- strsplit(snapshot, split = "-")[[1]][1]
snapshotM <- strsplit(snapshot, split = "-")[[1]][2]
snapshot <- paste(snapshotY, snapshotM, sep = "-")
### --- NOTE DECEMBER SNAPSHOT NOT READY (comment: 01/04/2018)
snapshot <- '2017-11' 
### --- END define snapshot

### --- projects list
projects <- c('dewiki', 'enwiki', 'frwiki')

### --- dir struct:
baseDir <- '/srv/home/goransm/_miscWMDE/newEditors_MediaWikiHistory'
outDir <- paste(baseDir, '/_results/', sep = "")
scriptDir <- paste(baseDir, '/_script/', sep = "")
setwd(scriptDir)

### --- run HiveQL scripts
for (i in 1:length(projects)) {

  hiveQL <- paste("SELECT event_user_id, SUBSTR(from_utc_timestamp(event_timestamp, 'CET'), 1, 10) 
                    FROM (
                      SELECT *,
                      row_number() OVER (partition by event_user_id ORDER by event_timestamp) rownum
                      FROM wmf.mediawiki_history WHERE wiki_db = '", projects[i],
                      "' AND event_entity = 'revision'
                      AND event_type = 'create'
                      AND event_user_is_created_by_self = true
                      AND event_user_is_bot_by_name = false
                      AND page_namespace = 0 
                      AND page_is_redirect = false
                      AND !(event_user_id = 0) 
                      AND snapshot = '", snapshot,
                  "') tab1 
                    WHERE rownum = 10;",
                  sep = "")
  
  # - write hql
  write(hiveQL, 'newEds10.hql')
  
  ### --- output filename
  filename <- paste('newUsers10_', projects[i],".tsv", sep = "")
  filename <- paste(outDir, filename, sep = "")
  
  ### --- execute hql script:
  hiveArgs <- 'beeline -f'
  hiveInput <- paste(paste(scriptDir, 'newEds10.hql', sep = ""),
                     " > ",
                     filename,
                     sep = "")
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  system(command = hiveCommand, wait = TRUE)
}
### --- END run HiveQL scripts

### --- anonymize, wrangle, and save
setwd(outDir)
Sys.setlocale("LC_TIME", "C")
lF <- list.files()
lF <- lF[grepl("tsv", lF, fixed = T)]
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dataSet <- readLines(lF[i])
  dataSet <- dataSet[16:(length(dataSet) - 2)]
  User <- sapply(dataSet, function(x) {
    strsplit(x, split = "\t", fixed = T)[[1]][1]
  })
  Date <- sapply(dataSet, function(x) {
    strsplit(x, split = "\t", fixed = T)[[1]][2]
  })
  dsFrame <- data.frame(User = User, 
                        Date = Date, 
                        stringsAsFactors = F,
                        row.names = seq(1, length(User), by = 1))
  rm(Date); rm(User); rm(dataSet); gc()
  dsFrame$Date <- as.Date(dsFrame$Date)
  dsFrame <- dsFrame %>% 
    arrange(Date)
  # - anonymize user Ids
  dsFrame$User <- paste("u_", seq(1, length(dsFrame$User), by = 1))
  # - produce dataset w. daily resoluton
  filename <- paste("NewUsersDaily_", project, ".csv", sep = "")
  dailyRes <- dsFrame %>% 
    group_by(Date) %>% 
    summarise(Count = n())
  dailyRes$Month <- sapply(months(dailyRes$Date), function(x) {
    which(month.name %in% x)
  })
  dailyRes$Year <- year(dailyRes$Date)
  dailyRes$Week <- week(dailyRes$Date)
  dailyRes$DayWeek <- weekdays(dailyRes$Date)
  write.csv(dailyRes, filename)
}
```

## 2. Weekly Trends (Big Picture)

```{r echo = T, warning = 'hide', message = F}
### --- Define projects under consideration:
# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
projects <- c('dewiki', 'enwiki', 'frwiki')
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Recent weekly trends:
for (i in 1:length(projects)) {
  w <- which(grepl(projects[i], lF, fixed = T))
  dataSet <- read.csv(paste('./_results/', lF[w], sep = ""), 
                      row.names = 1,
                      check.names = F,
                      header = T,
                      stringsAsFactors = F)
  dWeekly <- dataSet
  dWeekly$Month <- sapply(dWeekly$Month, function(x) {
    if (!(nchar(x) == 2)) {
      return(paste("0", x, sep = ""))
    } else {
      return(x)
    }
  })
  dWeekly$Week <- sapply(dWeekly$Week, function(x) {
    if (!(nchar(x) == 2)) {
      return(paste("0", x, sep = ""))
    } else {
      return(x)
    }
  })
  dWeekly$YW <- paste(dWeekly$Year, dWeekly$Week, sep = "-")
  dWeekly <- dWeekly %>% 
    group_by(YW) %>% 
    summarise(Count = sum(Count)) %>% 
    arrange(YW)
  dwSets[[i]] <- dWeekly
  rm(dWeekly); rm(dataSet); gc()
}
### --- produce plotSet
dweeks <- unlist(lapply(dwSets, function(x) {
  x$YW
}))
dMat <- as.data.frame(matrix('', nrow = length(dweeks), ncol = length(projects) + 1), 
                      stringsAsFactors = F)
colnames(dMat)[1] <- 'Week'
colnames(dMat)[2:dim(dMat)[2]] <- projects
dMat[, 1] <- dweeks
for (i in 1:length(dwSets)) {
  w <- which(dMat$Week %in% dwSets[[i]]$YW)
  dMat[, i+1][w] <- dwSets[[i]]$Count 
}
dMat <- dMat %>% 
  gather(key = Project,
         value = Count,
         projects)
dMat$Count <- as.numeric(dMat$Count)
# - x-axis labels
dMat$XLabs <- sapply(dMat$Week, function(x) {
  if (grepl("-01$", x)) {
    return(strsplit(x, split = "-", fixed = T)[[1]][1]) 
  } else {
    return("")
    }
})
# - Visualize w. {ggplot2}
ggplot(dMat, aes(x = Week,
                 y = Count,
                 group = Project,
                 color = Project,
                 fill = Project)) + 
  geom_line(size = .25) + 
  scale_y_continuous(labels = comma) + 
  scale_x_discrete(labels = dMat$XLabs) +
  xlab("Year (weekly data resolution)") +
  ylab("New editors") +
  ggtitle('New Editors (>= 10 edits) Income:\nWeekly comparison (starting at: Week 16. of 2016.)') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(panel.grid.major.x = element_blank()) + 
  theme(panel.grid.minor.x = element_blank())
```

Log-scale:

```{r echo = T, warning = 'hide', message = F}
# - Visualize w. {ggplot2}
ggplot(dMat, aes(x = Week,
                 y = log(Count),
                 group = Project,
                 color = Project,
                 fill = Project)) + 
  geom_line(size = .25) + 
  scale_y_continuous(labels = comma) + 
  scale_x_discrete(labels = dMat$XLabs) +
  xlab("Year (weekly data resolution)") +
  ylab("log(New editors)") +
  ggtitle('New Editors (>= 10 edits) Income:\nWeekly comparison (starting at: Week 16. of 2016.)') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(panel.grid.major.x = element_blank()) + 
  theme(panel.grid.minor.x = element_blank())
```

## 3. Monthly Trends: The Previous Six Months

```{r echo = T, warning = 'hide', message = F}
### --- Define projects under consideration/get files
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Monthly trends:
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dS <- read.csv(paste('./_results/', lF[i], sep = ""),
               header = T,
               check.names = F,
               stringsAsFactors = F,
               row.names = 1)
  dS$YearMonth <- paste(dS$Year, 
                        ifelse(nchar(dS$Month) == 2, 
                               dS$Month, 
                               paste("0", dS$Month, sep = "")
                               ),
                        sep = "-")
  dS <- dplyr::select(dS, YearMonth, Count)
  dS$Project <- project
  dwSets[[i]] <- dS
  rm(dS)
}
dwSets <- rbindlist(dwSets)
dwSets <- dwSets %>% 
  group_by(Project, YearMonth) %>% 
  summarise(Editors = sum(Count)) %>% 
  arrange(Project, YearMonth)
### --- determine: last 6 months
actualDate <- as.character(Sys.time())
actualDateY <- as.numeric(strsplit(actualDate, split = "-")[[1]][1])
actualDateM <- as.numeric(strsplit(actualDate, split = "-")[[1]][2])
actualDateM <- actualDateM - 1
actualDateM1 <- actualDateM - 5
monthsSeq <- actualDateM1:actualDateM
yearsSeq <- rep(actualDateY, length(monthsSeq))
yearsSeq[which(monthsSeq <= 0)] <- actualDateY - 1
monthsSeq[which(monthsSeq<= 0)] <- 12 - abs(monthsSeq[which(monthsSeq<= 0)])
targetYM <- paste(yearsSeq, 
                  ifelse(nchar(monthsSeq) == 2,
                         monthsSeq,
                         paste("0", monthsSeq, sep = "")
                         ),
                  sep = "-")
### --- filter: last 6 months
dwSets <- dwSets %>% 
  filter(YearMonth %in% targetYM)
### --- visualize w. {ggplot2}
colnames(dwSets)[2] <- 'Month'
dwSets$Month <- factor(dwSets$Month, levels = sort(unique(dwSets$Month)))
ggplot(dwSets,
       aes(x = Month, y = Editors, label = Editors)) +
          geom_line(size = .25, color = "#4c8cff", group = 1) +
          geom_point(size = 1.5, color = "#4c8cff") + 
          geom_point(size = 1, color = "white") + 
          geom_text_repel(data = dwSets, 
                          aes(x = Month, y = Editors, label = Editors), 
                          size = 3) +
          facet_wrap(~ Project, ncol = 3, scales = "free_y") +
          xlab('Month') + ylab('New Editors (>= 10 edits)') +
          scale_y_continuous(labels = comma) + 
          theme_minimal() +
          theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) +
          theme(axis.title.x = element_text(size = 12)) +
          theme(axis.title.y = element_text(size = 12)) +
          theme(plot.title = element_text(size = 15))
```

## 3. Monthly Trends: The Previous Two Years

```{r echo = T, warning = 'hide', message = F}
### --- Define projects under consideration/get files
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Monthly trends:
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dS <- read.csv(paste('./_results/', lF[i], sep = ""),
               header = T,
               check.names = F,
               stringsAsFactors = F,
               row.names = 1)
  dS$YearMonth <- paste(dS$Year, 
                        ifelse(nchar(dS$Month) == 2, 
                               dS$Month, 
                               paste("0", dS$Month, sep = "")
                               ),
                        sep = "-")
  dS <- dplyr::select(dS, YearMonth, Count)
  dS$Project <- project
  dwSets[[i]] <- dS
  rm(dS)
}
dwSets <- rbindlist(dwSets)
dwSets <- dwSets %>% 
  group_by(Project, YearMonth) %>% 
  summarise(Editors = sum(Count)) %>% 
  arrange(Project, YearMonth)
### --- determine: last 2 years
actualDate <- as.character(Sys.time())
actualDateY <- as.numeric(strsplit(actualDate, split = "-")[[1]][1])
actualDateM <- as.numeric(strsplit(actualDate, split = "-")[[1]][2])
actualDateM <- actualDateM - 1
actualDateM1 <- actualDateM - 12
monthsSeq <- actualDateM1:actualDateM
yearsSeq <- rep(actualDateY, length(monthsSeq))
yearsSeq[which(monthsSeq <= 0)] <- actualDateY - 1
monthsSeq[which(monthsSeq <= 0)] <- 12 - abs(monthsSeq[which(monthsSeq<= 0)])
targetYM <- paste(yearsSeq, 
                  ifelse(nchar(monthsSeq) == 2,
                         monthsSeq,
                         paste("0", monthsSeq, sep = "")
                         ),
                  sep = "-")
targetYM1 <- lapply(targetYM, function(x) {
  x <- strsplit(x, split = "-")[[1]]
  x[1] <- as.numeric(x[1]) - 1
  x <- paste(x, collapse = "-")
  return(x)
})
targetYM <- unique(c(targetYM, unlist(targetYM1)))
### --- filter: last 6 months
dwSets <- dwSets %>% 
  filter(YearMonth %in% targetYM)
### --- visualize w. {ggplot2}
colnames(dwSets)[2] <- 'Month'
dwSets$Month <- factor(dwSets$Month, levels = sort(unique(dwSets$Month)))
ggplot(dwSets,
       aes(x = Month, y = Editors, label = Editors)) +
          geom_line(size = .25, color = "#4c8cff", group = 1) +
          geom_point(size = 1.5, color = "#4c8cff") + 
          geom_point(size = 1, color = "white") + 
          geom_text_repel(data = dwSets, 
                          aes(x = Month, y = Editors, label = Editors), 
                          size = 3) +
          facet_wrap(~ Project, ncol = 1, scales = "free_y") +
          xlab('Month') + ylab('New Editors (>= 10 edits)') +
          scale_y_continuous(labels = comma) + 
          theme_minimal() +
          theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
          theme(axis.title.x = element_text(size = 12)) +
          theme(axis.title.y = element_text(size = 12)) +
          theme(plot.title = element_text(size = 15))
```

## 4. dewiki Forecast

The optimal ARIMA forecast for `dewiki`, starting from the first year with a complete monthly dataset (2007):

```{r echo = T, warning = 'hide', message = F}
### --- Data
dataSet <- read.csv(paste(getwd(), '/_results/NewUsersDaily_dewiki.csv', sep = ''),
                    header = T,
                    check.names = F,
                    stringsAsFactors = F,
                    row.names = 1)
### --- Wrangle
dataSet$Month <- sapply(dataSet$Month, function(x) {
  if(nchar(x) == 1) {
    x <- paste("0", x, sep = "")
  }
  x
})
dataSet <- arrange(dataSet, Year, Month)
# - complete data since 2007:
completeYears <- 2007:2017
wComplete <- rowSums(sapply(completeYears, function(x) {
  grepl(x, dataSet$Year)
  }))
dataSet <- dataSet[as.logical(wComplete), ]
# - summarise per month and drop incomplete data for the last month:
dataSet <- dataSet %>% 
  select(Year, Month, Count) %>% 
  group_by(Year, Month) %>% 
  summarise(Edits = sum(Count))
### --- as time series object:
timeEds <- ts(dataSet$Edits, 
              start = c(2007, 1), 
              end = c(2017, as.numeric(dataSet$Month[dim(dataSet)[1]])), 
              frequency = 12)
### --- ARIMA model
timeEdsARIMA <- auto.arima(timeEds, D = 1, seasonal = T)
timeEdsARIMA
plot(forecast(timeEdsARIMA))
```
