--
-- Script generating one day of project cube datasets in gzipped tsv format
-- (no headers).
--
-- For daily aggregates, time = '-'.
-- This script removes data points where view_count < 100 (k-anonymity).
--
-- Columns: day, time, project, agent_type, view_count
--
-- Usage:
--     hive -f project_cube.hql \
--         -d destination_directory=/tmp/test \
--         -d year=2015 \
--         -d month=7 \
--         -d day=1
--

SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
SET mapred.reduce.tasks = 1;

INSERT OVERWRITE DIRECTORY "${destination_directory}"
    SELECT
        CONCAT_WS(
            "\t",
            CONCAT(
                LPAD(year, 4, "0"), "-",
                LPAD(month, 2, "0"), "-",
                LPAD(day, 2, "0")
            ),
            CASE
                WHEN hour IS NULL THEN "-"
                ELSE CONCAT(LPAD(hour, 2, "0"), ":00:00")
            END,
            project,
            agent_type,
            CAST(view_count AS STRING)
        ) line
    FROM (
        SELECT
            year,
            month,
            day,
            hour,
            project,
            agent_type,
            SUM(view_count) AS view_count
        FROM wmf.projectview_hourly
        WHERE year=${year}
            AND month=${month}
            AND day=${day}
        GROUP BY year, month, day, hour, project, agent_type
        HAVING SUM(view_count) > 99

        UNION ALL

        SELECT
            year,
            month,
            day,
            NULL AS hour,
            project,
            agent_type,
            SUM(view_count) AS view_count
        FROM wmf.projectview_hourly
        WHERE year=${year}
            AND month=${month}
            AND day=${day}
        GROUP BY year, month, day, project, agent_type
        HAVING SUM(view_count) > 99
    ) hours_and_days
    ORDER BY line
    LIMIT 1000000
;
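
--
-- A minimal sanity-check sketch, not part of the pipeline: it counts the
-- hourly (project, agent_type) rows that the HAVING clauses above would
-- suppress under the k-anonymity threshold of 100 views. It assumes
-- interactive access to the same wmf.projectview_hourly table and the same
-- ${year}/${month}/${day} parameters; it is left commented out so that
-- `hive -f project_cube.hql` still executes only the INSERT above.
--
-- SELECT COUNT(*) AS suppressed_rows
-- FROM (
--     SELECT project, agent_type, hour, SUM(view_count) AS view_count
--     FROM wmf.projectview_hourly
--     WHERE year=${year}
--         AND month=${month}
--         AND day=${day}
--     GROUP BY project, agent_type, hour
--     HAVING SUM(view_count) < 100
-- ) suppressed;
--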