#! /bin/bash
DESCRIPTION="Get page IDs of articles translated by Toledo"
THISSCRIPTFILE=`basename "$0"`
RESULTSFILE=~/${THISSCRIPTFILE%.*}_result.txt
{ date ; echo = ; TZ='America/Los_Angeles' date ; echo generated by $THISSCRIPTFILE on $HOSTNAME ; 
d="2019-03-18"
while [ "$d" != 2019-06-15 ]
do 
  echo $d
  DAY=$(date -d "$d" '+%d')
  MONTH=$(date -d "$d" '+%m')
  beeline --verbose=true -e "
  SET hive.exec.dynamic.partition = true;
  SET hive.exec.dynamic.partition.mode = nonstrict;
  CREATE EXTERNAL TABLE IF NOT EXISTS chelsyx.toledo_pageid
    (
        \`page_id\`              bigint  COMMENT 'The ID of the article',
        \`database_code\`        string  COMMENT 'The code of the wiki'
    )
    PARTITIONED BY (
        \`year\`              int     COMMENT 'Unpadded year',
        \`month\`             int     COMMENT 'Unpadded month',
        \`day\`               int     COMMENT 'Unpadded day'
    )
    STORED AS PARQUET
    LOCATION '/user/chelsyx/toledo_pageid'
    ;

    INSERT INTO TABLE chelsyx.toledo_pageid partition(year,month,day)

    select distinct r.page_id, w.database_code, r.year, r.month, r.day
    from wmf.webrequest r join canonical_data.wikis w on CONCAT(r.pageview_info['project'], '.org') = w.domain_name
    where
        year = 2019
        and month = ${MONTH}
        and day = ${DAY}
        and webrequest_source = 'text'
        and is_pageview
        and namespace_id=0
        and x_analytics_map['translationengine'] = 'GT'
        and parse_url(referer, 'QUERY') like '%client=srp%'
        and (regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])hl=([^&]*)', 2) = 'id'
        or regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])tl=([^&]*)', 2) = 'id')
    ;
  "
  d=$(date -I -d "$d + 1 day")
done
} &> $RESULTSFILE