--
-- Script generating one day of page_title hourly datasets in gzipped TSV format (no headers).
-- This script purposely excludes bots and undefined page_title ('-').
-- This script removes data points where view_count < 100 (k-anonymity).
-- Columns: day, time, project, language_variant, page_title, view_count
-- Usage: hive -f page_title_hourly.hql -d destination_directory=/tmp/test -d year=2015 -d month=7 -d day=1
--

-- Compress the output with gzip and force a single reducer so the
-- globally ordered result lands in a single output file.
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
SET mapred.reduce.tasks = 1;

INSERT OVERWRITE DIRECTORY "${destination_directory}"
SELECT
    CONCAT_WS(
        "\t",
        -- Date column, zero-padded to YYYY-MM-DD
        CONCAT(LPAD(year, 4, "0"), "-", LPAD(month, 2, "0"), "-", LPAD(day, 2, "0")),
        -- Time column, truncated to the hour (HH:00:00)
        CONCAT(LPAD(hour, 2, "0"), ":00:00"),
        project,
        language_variant,
        page_title,
        CAST(SUM(view_count) AS STRING)
    ) line
FROM wmf.pageview_hourly
WHERE year=${year}
    AND month=${month}
    AND day=${day}
    -- Keep only user traffic (excludes bots/spiders)
    AND agent_type = 'user'
    -- Drop undefined page titles
    AND page_title != '-'
GROUP BY year, month, day, hour, project, language_variant, page_title
-- k-anonymity threshold: keep rows with at least 100 views in the hour
HAVING SUM(view_count) > 99
ORDER BY line
LIMIT 1000000000
;
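
--
-- Example (not part of the generation job above): a minimal, hedged sketch of how the
-- resulting headerless TSV could be read back into Hive as an external table. The table
-- name and LOCATION are illustrative assumptions only; point LOCATION at the actual
-- destination_directory. Hive reads the gzipped text part files transparently.
--
--   CREATE EXTERNAL TABLE tmp_page_title_hourly (
--       `day`            STRING,
--       `time`           STRING,
--       project          STRING,
--       language_variant STRING,
--       page_title       STRING,
--       view_count       BIGINT
--   )
--   ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
--   STORED AS TEXTFILE
--   LOCATION '/tmp/test';
--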