USE wmf_raw; ADD JAR hdfs:///wmf/refinery/current/artifacts/refinery-hive.jar; CREATE TEMPORARY FUNCTION is_spider as 'org.wikimedia.analytics.refinery.hive.IsSpiderUDF'; CREATE TEMPORARY FUNCTION ua_parse as 'org.wikimedia.analytics.refinery.hive.GetUAPropertiesUDF'; CREATE TEMPORARY FUNCTION host_parse as 'org.wikimedia.analytics.refinery.hive.GetHostPropertiesUDF'; WITH profile_all AS ( -- We need to construct a '&ns0=1&...&ns####=1& that covers all the namespaces within a wiki when the user uses the "All" search profile SELECT dbname AS wiki, CONCAT('&ns0', REGEXP_REPLACE(CONCAT('ns', CONCAT_WS('ns', COLLECT_SET(CAST(namespace AS STRING)))), 'ns', '=1&ns')) AS ns_qs FROM mediawiki_project_namespace_map WHERE snapshot = '2017-07' AND namespace > 0 GROUP BY dbname ), requests AS ( SELECT id, wiki_id, CASE WHEN INSTR(payload_qs, 'profile=advanced') > 0 THEN payload_qs -- Build a new "queryString" for searches using the non-advanced profiles: WHEN (INSTR(payload_qs, 'profile=') = 0 OR INSTR(payload_qs, 'profile=default') > 0) THEN CONCAT(payload_qs, '&ns0=1') WHEN INSTR(payload_qs, 'profile=images') > 0 THEN CONCAT(payload_qs, '&ns6=1') WHEN INSTR(payload_qs, 'profile=all') > 0 THEN CONCAT(payload_qs, profile_all.ns_qs) END AS query_string FROM ( SELECT id AS id, wikiid AS wiki_id, payload['queryString'] AS payload_qs FROM CirrusSearchRequestSet WHERE year = ${year} AND month = ${month} AND day = ${day} AND source = 'web' AND INSTR(payload['queryString'], 'fulltext=1') > 0 -- Filter out searches with weird profiles like Translations and Discussions ("profile=thread"): AND (INSTR(payload['queryString'], 'profile=') = 0 OR payload['queryString'] RLIKE 'profile=((advanced)|(default)|(images)|(all))') -- Remove possible bots: AND NOT ( useragent IS NULL OR useragent = '' OR ua_parse(useragent)['device_family'] = 'Spider' OR is_spider(useragent) OR ip = '127.0.0.1' OR useragent RLIKE 'https?://' OR INSTR(useragent, 'www.') > 0 OR INSTR(useragent, 'github') > 0 OR LOWER(useragent) RLIKE '([a-z0-9._%-]+@[a-z0-9.-]+\\.(com|us|net|org|edu|gov|io|ly|co|uk))' OR ( ua_parse(useragent)['browser_family'] = 'Other' AND ua_parse(useragent)['device_family'] = 'Other' AND ua_parse(useragent)['os_family'] = 'Other' ) ) ) csrs LEFT JOIN profile_all ON csrs.wiki_id = profile_all.wiki ), searches AS ( SELECT DISTINCT id, wiki_id, STR_TO_MAP(query_string, '&', '=') AS query_map FROM requests ), exploded_searches AS ( SELECT s.id, s.wiki_id, exp.key AS query_key, exp.val AS query_value FROM searches s LATERAL VIEW EXPLODE(s.query_map) exp AS key, val ), wiki_map AS ( SELECT DISTINCT dbname AS wiki, host_parse(hostname) AS normalized_hostname, CONCAT('ns', namespace) AS ns, IF(namespace = 0, "Article", namespace_canonical_name) AS canonical_namespace FROM mediawiki_project_namespace_map WHERE snapshot = '2017-07' -- Only include namespaces that have an ASCII canonical name (ns0's name is "" instead of "Article" in the db): AND (namespace_canonical_name RLIKE '^[A-Za-z\\s]+$' OR namespace = 0) ) SELECT project, language, id AS search_id, namespace AS namespace_searched FROM ( SELECT es.id AS id, es.wiki_id AS wiki, wm.normalized_hostname.project_class AS project, wm.normalized_hostname.project AS language, wm.canonical_namespace AS namespace FROM exploded_searches es LEFT JOIN wiki_map wm ON es.wiki_id = wm.wiki AND es.query_key = wm.ns ) ns WHERE namespace IS NOT NULL;