Feedback should be send to goran.milovanovic_ext@wikimedia.de.

1. Data Acquistion

The Data Acquisition code chunk is not reproducible from this report. It is run as an newEds_mediawiki_history.R script in production (currently stat1005). The userIds are anonymized and the dataset is copied mannually from production and processed locally to produce this Report.

### --- Script: newEds_mediawiki_history.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/_miscWMDE/newEditors_MediaWikiHistory/newEds_mediawiki_history.R

### --- The script collects (a) userIds, and (b) dates (yyyymmdd) on which a particular user
### --- has reached >= 10 edits on a given project.
### --- The datasets are used for the New Editors Report on `dewiki`

### --- Goran S. Milovanovic, Data Analyst, WMDE
### --- October 16, 2017.

rm(list = ls())
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Define snapshot for wmf.mediawiki_history
snapshot <- as.character(Sys.time())
snapshotY <- strsplit(snapshot, split = "-")[[1]][1]
snapshotM <- strsplit(snapshot, split = "-")[[1]][2]
snapshot <- paste(snapshotY, snapshotM, sep = "-")
### --- NOTE OCTOBER SNAPSHOT NOT READY (11/11/2017)
snapshot <- '2017-09' 
### --- END define snapshot

### --- projects list
projects <- c('dewiki', 'enwiki', 'frwiki')

### --- dir struct:
baseDir <- '/srv/home/goransm/_miscWMDE/newEditors_MediaWikiHistory'
outDir <- paste(baseDir, '/_results/', sep = "")
scriptDir <- paste(baseDir, '/_script/', sep = "")
setwd(scriptDir)

### --- run HiveQL scripts
for (i in 1:length(projects)) {
  
  hiveQL <- paste("SELECT event_user_id, SUBSTR(from_utc_timestamp(event_timestamp, 'CET'), 1, 10) 
                    FROM (
                      SELECT *,
                      row_number() OVER (partition by event_user_id ORDER by event_timestamp) rownum
                      FROM wmf.mediawiki_history WHERE wiki_db = '", projects[i],
                      "' AND event_entity = 'revision'
                      AND event_type = 'create'
                      AND event_user_is_created_by_self = true
                      AND event_user_is_bot_by_name = false
                      AND page_namespace = 0 
                      AND page_namespace_latest = 0 
                      AND page_is_redirect_latest = false
                      AND !(event_user_id = 0) 
                      AND snapshot = '", snapshot,
                  "') tab1 
                    WHERE rownum = 10;",
                  sep = "")
  
  # - write hql
  write(hiveQL, 'newEds10.hql')
  
  ### --- output filename
  filename <- paste('newUsers10_', projects[i],".tsv", sep = "")
  filename <- paste(outDir, filename, sep = "")
  
  ### --- execute hql script:
  hiveArgs <- 'beeline -f'
  hiveInput <- paste(paste(scriptDir, 'newEds10.hql', sep = ""),
                     " > ",
                     filename,
                     sep = "")
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  system(command = hiveCommand, wait = TRUE)
}
### --- END run HiveQL scripts

### --- anonymize, wrangle, and save
setwd(outDir)
Sys.setlocale("LC_TIME", "C")
lF <- list.files()
lF <- lF[grepl("tsv", lF, fixed = T)]
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dataSet <- readLines(lF[i])
  dataSet <- dataSet[16:(length(dataSet) - 2)]
  User <- sapply(dataSet, function(x) {
    strsplit(x, split = "\t", fixed = T)[[1]][1]
  })
  Date <- sapply(dataSet, function(x) {
    strsplit(x, split = "\t", fixed = T)[[1]][2]
  })
  dsFrame <- data.frame(User = User, 
                        Date = Date, 
                        stringsAsFactors = F,
                        row.names = seq(1, length(User), by = 1))
  rm(Date); rm(User); rm(dataSet); gc()
  dsFrame$Date <- as.Date(dsFrame$Date)
  dsFrame <- dsFrame %>% 
    arrange(Date)
  # - anonymize user Ids
  dsFrame$User <- paste("u_", seq(1, length(dsFrame$User), by = 1))
  # - produce dataset w. daily resoluton
  filename <- paste("NewUsersDaily_", project, ".csv", sep = "")
  dailyRes <- dsFrame %>% 
    group_by(Date) %>% 
    summarise(Count = n())
  dailyRes$Month <- sapply(months(dailyRes$Date), function(x) {
    which(month.name %in% x)
  })
  dailyRes$Year <- year(dailyRes$Date)
  dailyRes$Week <- week(dailyRes$Date)
  dailyRes$DayWeek <- weekdays(dailyRes$Date)
  write.csv(dailyRes, filename)
}

4. dewiki Forecast

The optimal ARIMA forecast for dewiki, starting from the first year with a complete monthly dataset (2007):

library(tseries)
library(forecast)
### --- Data
dataSet <- read.csv(paste(getwd(), '/_results/NewUsersDaily_dewiki.csv', sep = ''),
                    header = T,
                    check.names = F,
                    stringsAsFactors = F,
                    row.names = 1)
### --- Wrangle
dataSet$Month <- sapply(dataSet$Month, function(x) {
  if(nchar(x) == 1) {
    x <- paste("0", x, sep = "")
  }
  x
})
dataSet <- arrange(dataSet, Year, Month)
# - complete data since 2007:
completeYears <- 2007:2017
wComplete <- rowSums(sapply(completeYears, function(x) {
  grepl(x, dataSet$Year)
  }))
dataSet <- dataSet[as.logical(wComplete), ]
# - summarise per month and drop incomplete data for the last month:
dataSet <- dataSet %>% 
  select(Year, Month, Count) %>% 
  group_by(Year, Month) %>% 
  summarise(Edits = sum(Count))
### --- as time series object:
timeEds <- ts(dataSet$Edits, 
              start = c(2007, 1), 
              end = c(2017, as.numeric(dataSet$Month[dim(dataSet)[1]])), 
              frequency = 12)
### --- ARIMA model
timeEdsARIMA <- auto.arima(timeEds, D = 1, seasonal = T)
timeEdsARIMA
Series: timeEds 
ARIMA(0,1,1)(2,1,0)[12] 

Coefficients:
          ma1     sar1     sar2
      -0.5150  -0.9034  -0.4241
s.e.   0.0793   0.0887   0.0946

sigma^2 estimated as 2999:  log likelihood=-633.05
AIC=1274.1   AICc=1274.46   BIC=1285.11
plot(forecast(timeEdsARIMA))

---
title: "New Editors on de.wikipedia.org"
author: "Goran S. Milovanovic, Data Scientist, WMDE"
date: "November 11, 2017"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_depth: 2
    toc_float: yes
  html_document:
    toc: yes
    toc_depth: 2
---

**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

```{r, echo = F, warning = F, message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
```

## 1. Data Acquistion

The Data Acquisition code chunk is not reproducible from this report. It is run as an `newEds_mediawiki_history.R` script in production (currently `stat1005`). The `userIds` are anonymized and the dataset is copied mannually from production and processed locally to produce this Report.

```{r, echo = T, eval = F}
### --- Script: newEds_mediawiki_history.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/_miscWMDE/newEditors_MediaWikiHistory/newEds_mediawiki_history.R

### --- The script collects (a) userIds, and (b) dates (yyyymmdd) on which a particular user
### --- has reached >= 10 edits on a given project.
### --- The datasets are used for the New Editors Report on `dewiki`

### --- Goran S. Milovanovic, Data Analyst, WMDE
### --- October 16, 2017.

rm(list = ls())
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Define snapshot for wmf.mediawiki_history
snapshot <- as.character(Sys.time())
snapshotY <- strsplit(snapshot, split = "-")[[1]][1]
snapshotM <- strsplit(snapshot, split = "-")[[1]][2]
snapshot <- paste(snapshotY, snapshotM, sep = "-")
### --- NOTE OCTOBER SNAPSHOT NOT READY (11/11/2017)
snapshot <- '2017-09' 
### --- END define snapshot

### --- projects list
projects <- c('dewiki', 'enwiki', 'frwiki')

### --- dir struct:
baseDir <- '/srv/home/goransm/_miscWMDE/newEditors_MediaWikiHistory'
outDir <- paste(baseDir, '/_results/', sep = "")
scriptDir <- paste(baseDir, '/_script/', sep = "")
setwd(scriptDir)

### --- run HiveQL scripts
for (i in 1:length(projects)) {
  
  hiveQL <- paste("SELECT event_user_id, SUBSTR(from_utc_timestamp(event_timestamp, 'CET'), 1, 10) 
                    FROM (
                      SELECT *,
                      row_number() OVER (partition by event_user_id ORDER by event_timestamp) rownum
                      FROM wmf.mediawiki_history WHERE wiki_db = '", projects[i],
                      "' AND event_entity = 'revision'
                      AND event_type = 'create'
                      AND event_user_is_created_by_self = true
                      AND event_user_is_bot_by_name = false
                      AND page_namespace = 0 
                      AND page_namespace_latest = 0 
                      AND page_is_redirect_latest = false
                      AND !(event_user_id = 0) 
                      AND snapshot = '", snapshot,
                  "') tab1 
                    WHERE rownum = 10;",
                  sep = "")
  
  # - write hql
  write(hiveQL, 'newEds10.hql')
  
  ### --- output filename
  filename <- paste('newUsers10_', projects[i],".tsv", sep = "")
  filename <- paste(outDir, filename, sep = "")
  
  ### --- execute hql script:
  hiveArgs <- 'beeline -f'
  hiveInput <- paste(paste(scriptDir, 'newEds10.hql', sep = ""),
                     " > ",
                     filename,
                     sep = "")
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  system(command = hiveCommand, wait = TRUE)
}
### --- END run HiveQL scripts

### --- anonymize, wrangle, and save
setwd(outDir)
Sys.setlocale("LC_TIME", "C")
lF <- list.files()
lF <- lF[grepl("tsv", lF, fixed = T)]
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dataSet <- readLines(lF[i])
  dataSet <- dataSet[16:(length(dataSet) - 2)]
  User <- sapply(dataSet, function(x) {
    strsplit(x, split = "\t", fixed = T)[[1]][1]
  })
  Date <- sapply(dataSet, function(x) {
    strsplit(x, split = "\t", fixed = T)[[1]][2]
  })
  dsFrame <- data.frame(User = User, 
                        Date = Date, 
                        stringsAsFactors = F,
                        row.names = seq(1, length(User), by = 1))
  rm(Date); rm(User); rm(dataSet); gc()
  dsFrame$Date <- as.Date(dsFrame$Date)
  dsFrame <- dsFrame %>% 
    arrange(Date)
  # - anonymize user Ids
  dsFrame$User <- paste("u_", seq(1, length(dsFrame$User), by = 1))
  # - produce dataset w. daily resoluton
  filename <- paste("NewUsersDaily_", project, ".csv", sep = "")
  dailyRes <- dsFrame %>% 
    group_by(Date) %>% 
    summarise(Count = n())
  dailyRes$Month <- sapply(months(dailyRes$Date), function(x) {
    which(month.name %in% x)
  })
  dailyRes$Year <- year(dailyRes$Date)
  dailyRes$Week <- week(dailyRes$Date)
  dailyRes$DayWeek <- weekdays(dailyRes$Date)
  write.csv(dailyRes, filename)
}
```

## 2. Weekly Trends (Big Picture)

```{r echo = T, warning = 'hide', message = F}
### --- Define projects under consideration:
projects <- c('dewiki', 'enwiki', 'frwiki')
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Recent weekly trends:
for (i in 1:length(projects)) {
  w <- which(grepl(projects[i], lF, fixed = T))
  dataSet <- read.csv(paste('./_results/', lF[w], sep = ""), 
                      row.names = 1,
                      check.names = F,
                      header = T,
                      stringsAsFactors = F)
  dWeekly <- dataSet
  dWeekly$Month <- sapply(dWeekly$Month, function(x) {
    if (!(nchar(x) == 2)) {
      return(paste("0", x, sep = ""))
    } else {
      return(x)
    }
  })
  dWeekly$Week <- sapply(dWeekly$Week, function(x) {
    if (!(nchar(x) == 2)) {
      return(paste("0", x, sep = ""))
    } else {
      return(x)
    }
  })
  dWeekly$YW <- paste(dWeekly$Year, dWeekly$Week, sep = "-")
  dWeekly <- dWeekly %>% 
    group_by(YW) %>% 
    summarise(Count = sum(Count)) %>% 
    arrange(YW)
  dwSets[[i]] <- dWeekly
  rm(dWeekly); rm(dataSet); gc()
}
### --- produce plotSet
dweeks <- unlist(lapply(dwSets, function(x) {
  x$YW
}))
dMat <- as.data.frame(matrix('', nrow = length(dweeks), ncol = length(projects) + 1), 
                      stringsAsFactors = F)
colnames(dMat)[1] <- 'Week'
colnames(dMat)[2:dim(dMat)[2]] <- projects
dMat[, 1] <- dweeks
for (i in 1:length(dwSets)) {
  w <- which(dMat$Week %in% dwSets[[i]]$YW)
  dMat[, i+1][w] <- dwSets[[i]]$Count 
}
dMat <- dMat %>% 
  gather(key = Project,
         value = Count,
         projects)
dMat$Count <- as.numeric(dMat$Count)
# - x-axis labels
dMat$XLabs <- sapply(dMat$Week, function(x) {
  if (grepl("-01$", x)) {
    return(strsplit(x, split = "-", fixed = T)[[1]][1]) 
  } else {
    return("")
    }
})
# - Visualize w. {ggplot2}
ggplot(dMat, aes(x = Week,
                 y = Count,
                 group = Project,
                 color = Project,
                 fill = Project)) + 
  geom_line(size = .25) + 
  scale_y_continuous(labels = comma) + 
  scale_x_discrete(labels = dMat$XLabs) +
  xlab("Year (weekly data resolution)") +
  ylab("New editors") +
  ggtitle('New Editors (>= 10 edits) Income:\nWeekly comparison (starting at: Week 16. of 2016.)') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(panel.grid.major.x = element_blank()) + 
  theme(panel.grid.minor.x = element_blank())
```

Log-scale:

```{r echo = T, warning = 'hide', message = F}
# - Visualize w. {ggplot2}
ggplot(dMat, aes(x = Week,
                 y = log(Count),
                 group = Project,
                 color = Project,
                 fill = Project)) + 
  geom_line(size = .25) + 
  scale_y_continuous(labels = comma) + 
  scale_x_discrete(labels = dMat$XLabs) +
  xlab("Year (weekly data resolution)") +
  ylab("log(New editors)") +
  ggtitle('New Editors (>= 10 edits) Income:\nWeekly comparison (starting at: Week 16. of 2016.)') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(panel.grid.major.x = element_blank()) + 
  theme(panel.grid.minor.x = element_blank())
```

## 3. Monthly Trends: The Previous Six Months

```{r echo = T, warning = 'hide', message = F}
### --- Define projects under consideration/get files
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Monthly trends:
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dS <- read.csv(paste('./_results/', lF[i], sep = ""),
               header = T,
               check.names = F,
               stringsAsFactors = F,
               row.names = 1)
  dS$YearMonth <- paste(dS$Year, 
                        ifelse(nchar(dS$Month) == 2, 
                               dS$Month, 
                               paste("0", dS$Month, sep = "")
                               ),
                        sep = "-")
  dS <- dplyr::select(dS, YearMonth, Count)
  dS$Project <- project
  dwSets[[i]] <- dS
  rm(dS)
}
dwSets <- rbindlist(dwSets)
dwSets <- dwSets %>% 
  group_by(Project, YearMonth) %>% 
  summarise(Editors = sum(Count)) %>% 
  arrange(Project, YearMonth)
### --- determine: last 6 months
actualDate <- as.character(Sys.time())
actualDateY <- as.numeric(strsplit(actualDate, split = "-")[[1]][1])
actualDateM <- as.numeric(strsplit(actualDate, split = "-")[[1]][2])
actualDateM <- actualDateM - 1
actualDateM1 <- actualDateM - 5
monthsSeq <- actualDateM1:actualDateM
yearsSeq <- rep(actualDateY, length(monthsSeq))
yearsSeq[which(monthsSeq <= 0)] <- actualDateY - 1
monthsSeq[which(monthsSeq<= 0)] <- 12 - abs(monthsSeq[which(monthsSeq<= 0)])
targetYM <- paste(yearsSeq, 
                  ifelse(nchar(monthsSeq) == 2,
                         monthsSeq,
                         paste("0", monthsSeq, sep = "")
                         ),
                  sep = "-")
### --- filter: last 6 months
dwSets <- dwSets %>% 
  filter(YearMonth %in% targetYM)
### --- visualize w. {ggplot2}
colnames(dwSets)[2] <- 'Month'
dwSets$Month <- factor(dwSets$Month, levels = sort(unique(dwSets$Month)))
ggplot(dwSets,
       aes(x = Month, y = Editors, label = Editors)) +
          geom_line(size = .25, color = "#4c8cff", group = 1) +
          geom_point(size = 1.5, color = "#4c8cff") + 
          geom_point(size = 1, color = "white") + 
          geom_text_repel(data = dwSets, 
                          aes(x = Month, y = Editors, label = Editors), 
                          size = 3) +
          facet_wrap(~ Project, ncol = 3, scales = "free_y") +
          xlab('Month') + ylab('New Editors (>= 10 edits)') +
          scale_y_continuous(labels = comma) + 
          theme_minimal() +
          theme(axis.text.x = element_text(angle = 90, size = 10, hjust = 1)) +
          theme(axis.title.x = element_text(size = 12)) +
          theme(axis.title.y = element_text(size = 12)) +
          theme(plot.title = element_text(size = 15))
```

## 3. Monthly Trends: The Previous Two Years

```{r echo = T, warning = 'hide', message = F}
### --- Define projects under consideration/get files
lF <- list.files('./_results/')
lF <- lF[grepl(".csv", lF, fixed = T)]
dwSets <- list()
### --- Monthly trends:
for (i in 1:length(lF)) {
  project <- strsplit(
    strsplit(lF[i], split = "_", fixed = T)[[1]][2],
    split = ".", fixed = T)[[1]][1]
  dS <- read.csv(paste('./_results/', lF[i], sep = ""),
               header = T,
               check.names = F,
               stringsAsFactors = F,
               row.names = 1)
  dS$YearMonth <- paste(dS$Year, 
                        ifelse(nchar(dS$Month) == 2, 
                               dS$Month, 
                               paste("0", dS$Month, sep = "")
                               ),
                        sep = "-")
  dS <- dplyr::select(dS, YearMonth, Count)
  dS$Project <- project
  dwSets[[i]] <- dS
  rm(dS)
}
dwSets <- rbindlist(dwSets)
dwSets <- dwSets %>% 
  group_by(Project, YearMonth) %>% 
  summarise(Editors = sum(Count)) %>% 
  arrange(Project, YearMonth)
### --- determine: last 2 years
actualDate <- as.character(Sys.time())
actualDateY <- as.numeric(strsplit(actualDate, split = "-")[[1]][1])
actualDateM <- as.numeric(strsplit(actualDate, split = "-")[[1]][2])
actualDateM <- actualDateM - 1
actualDateM1 <- actualDateM - 12
monthsSeq <- actualDateM1:actualDateM
yearsSeq <- rep(actualDateY, length(monthsSeq))
yearsSeq[which(monthsSeq <= 0)] <- actualDateY - 1
monthsSeq[which(monthsSeq <= 0)] <- 12 - abs(monthsSeq[which(monthsSeq<= 0)])
targetYM <- paste(yearsSeq, 
                  ifelse(nchar(monthsSeq) == 2,
                         monthsSeq,
                         paste("0", monthsSeq, sep = "")
                         ),
                  sep = "-")
targetYM1 <- lapply(targetYM, function(x) {
  x <- strsplit(x, split = "-")[[1]]
  x[1] <- as.numeric(x[1]) - 1
  x <- paste(x, collapse = "-")
  return(x)
})
targetYM <- unique(c(targetYM, unlist(targetYM1)))
### --- filter: last 6 months
dwSets <- dwSets %>% 
  filter(YearMonth %in% targetYM)
### --- visualize w. {ggplot2}
colnames(dwSets)[2] <- 'Month'
dwSets$Month <- factor(dwSets$Month, levels = sort(unique(dwSets$Month)))
ggplot(dwSets,
       aes(x = Month, y = Editors, label = Editors)) +
          geom_line(size = .25, color = "#4c8cff", group = 1) +
          geom_point(size = 1.5, color = "#4c8cff") + 
          geom_point(size = 1, color = "white") + 
          geom_text_repel(data = dwSets, 
                          aes(x = Month, y = Editors, label = Editors), 
                          size = 3) +
          facet_wrap(~ Project, ncol = 1, scales = "free_y") +
          xlab('Month') + ylab('New Editors (>= 10 edits)') +
          scale_y_continuous(labels = comma) + 
          theme_minimal() +
          theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
          theme(axis.title.x = element_text(size = 12)) +
          theme(axis.title.y = element_text(size = 12)) +
          theme(plot.title = element_text(size = 15))
```

## 4. dewiki Forecast

The optimal ARIMA forecast for `dewiki`, starting from the first year with a complete monthly dataset (2007):

```{r echo = T, warning = 'hide', message = F}
library(tseries)
library(forecast)
### --- Data
dataSet <- read.csv(paste(getwd(), '/_results/NewUsersDaily_dewiki.csv', sep = ''),
                    header = T,
                    check.names = F,
                    stringsAsFactors = F,
                    row.names = 1)
### --- Wrangle
dataSet$Month <- sapply(dataSet$Month, function(x) {
  if(nchar(x) == 1) {
    x <- paste("0", x, sep = "")
  }
  x
})
dataSet <- arrange(dataSet, Year, Month)
# - complete data since 2007:
completeYears <- 2007:2017
wComplete <- rowSums(sapply(completeYears, function(x) {
  grepl(x, dataSet$Year)
  }))
dataSet <- dataSet[as.logical(wComplete), ]
# - summarise per month and drop incomplete data for the last month:
dataSet <- dataSet %>% 
  select(Year, Month, Count) %>% 
  group_by(Year, Month) %>% 
  summarise(Edits = sum(Count))
### --- as time series object:
timeEds <- ts(dataSet$Edits, 
              start = c(2007, 1), 
              end = c(2017, as.numeric(dataSet$Month[dim(dataSet)[1]])), 
              frequency = 12)
### --- ARIMA model
timeEdsARIMA <- auto.arima(timeEds, D = 1, seasonal = T)
timeEdsARIMA
plot(forecast(timeEdsARIMA))
```
