--
-- Script generating one day of project cube datasets in gzipped tsv format
-- (no headers).
--
-- For daily aggregates, time = '-'.
-- This script removes data points where view_count < 100 (k-anonymity).
--
-- Columns: day, time, project, agent_type, view_count
--
-- Usage:
--     hive -f project_cube.hql \
--         -d destination_directory=/tmp/test \
--         -d year=2015 \
--         -d month=7 \
--         -d day=1
--

SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
SET mapred.reduce.tasks = 1;

INSERT OVERWRITE DIRECTORY "${destination_directory}"
    SELECT
        CONCAT_WS(
            "\t",
            CONCAT(
                LPAD(year, 4, "0"), "-",
                LPAD(month, 2, "0"), "-",
                LPAD(day, 2, "0")
            ),
            CASE
                WHEN hour IS NULL THEN "-"
                ELSE CONCAT(LPAD(hour, 2, "0"), ":00:00")
            END,
            project,
            agent_type,
            CAST(view_count AS STRING)
        ) line
    FROM (
        SELECT
            year,
            month,
            day,
            hour,
            project,
            agent_type,
            SUM(view_count) AS view_count
        FROM wmf.projectview_hourly
        WHERE year=${year}
            AND month=${month}
            AND day=${day}
        GROUP BY year, month, day, hour, project, agent_type
        HAVING SUM(view_count) > 99

        UNION ALL

        SELECT
            year,
            month,
            day,
            NULL AS hour,
            project,
            agent_type,
            SUM(view_count) AS view_count
        FROM wmf.projectview_hourly
        WHERE year=${year}
            AND month=${month}
            AND day=${day}
        GROUP BY year, month, day, project, agent_type
        HAVING SUM(view_count) > 99
    ) hours_and_days
    ORDER BY line
    LIMIT 1000000
;
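
--
-- A minimal sanity-check sketch, not part of the pipeline: it counts the
-- hourly (project, agent_type) rows that the HAVING clauses above would
-- suppress under the k-anonymity threshold of 100 views. It assumes
-- interactive access to the same wmf.projectview_hourly table and the same
-- ${year}/${month}/${day} parameters; it is left commented out so that
-- `hive -f project_cube.hql` still executes only the INSERT above.
--
-- SELECT COUNT(*) AS suppressed_rows
-- FROM (
--     SELECT project, agent_type, hour, SUM(view_count) AS view_count
--     FROM wmf.projectview_hourly
--     WHERE year=${year}
--         AND month=${month}
--         AND day=${day}
--     GROUP BY project, agent_type, hour
--     HAVING SUM(view_count) < 100
-- ) suppressed;
--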