#! /bin/bash DESCRIPTION="Get page IDs of articles translated by Toledo" THISSCRIPTFILE=`basename "$0"` RESULTSFILE=~/${THISSCRIPTFILE%.*}_result.txt { date ; echo = ; TZ='America/Los_Angeles' date ; echo generated by $THISSCRIPTFILE on $HOSTNAME ; d="2019-03-18" while [ "$d" != 2019-06-15 ] do echo $d DAY=$(date -d "$d" '+%d') MONTH=$(date -d "$d" '+%m') beeline --verbose=true -e " SET hive.exec.dynamic.partition = true; SET hive.exec.dynamic.partition.mode = nonstrict; CREATE EXTERNAL TABLE IF NOT EXISTS chelsyx.toledo_pageid ( \`page_id\` bigint COMMENT 'The ID of the article', \`database_code\` string COMMENT 'The code of the wiki' ) PARTITIONED BY ( \`year\` int COMMENT 'Unpadded year', \`month\` int COMMENT 'Unpadded month', \`day\` int COMMENT 'Unpadded day' ) STORED AS PARQUET LOCATION '/user/chelsyx/toledo_pageid' ; INSERT INTO TABLE chelsyx.toledo_pageid partition(year,month,day) select distinct r.page_id, w.database_code, r.year, r.month, r.day from wmf.webrequest r join canonical_data.wikis w on CONCAT(r.pageview_info['project'], '.org') = w.domain_name where year = 2019 and month = ${MONTH} and day = ${DAY} and webrequest_source = 'text' and is_pageview and namespace_id=0 and x_analytics_map['translationengine'] = 'GT' and parse_url(referer, 'QUERY') like '%client=srp%' and (regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])hl=([^&]*)', 2) = 'id' or regexp_extract(parse_url(referer, 'QUERY'), '(^|[&?])tl=([^&]*)', 2) = 'id') ; " d=$(date -I -d "$d + 1 day") done } &> $RESULTSFILE