diff --git a/lib/most-read.js b/lib/most-read.js
index 4c64471..d54f878 100644
--- a/lib/most-read.js
+++ b/lib/most-read.js
@@ -1,202 +1,210 @@
 /**
  * Most-read article promise and related support functions.
  */

 'use strict';

 const BBPromise = require('bluebird');
 const regexEscape = require('escape-string-regexp');
 const dateUtil = require('./date-util');
 const pageviews = require('./pageviews');
 const si = require('./siteinfo');
 const util = require('./util');
 const apiUtil = require('./api-util');
 const Title = require('mediawiki-title').Title;

 // Upper bound on how many top-pageviews entries are processed; promise()
 // slices the upstream article list to this length.
 const MAX_TITLES = 50;

 // Titles that must never appear in the response. isAllowed() does an exact,
 // case-sensitive string match of these values against entry.titles.canonical.
 const DENY_LIST = [
     '-',
     'Test_card',
     'Web_scraping',
     'XHamster',
     'Java_(programming_language)',
     'Images/upload/bel.jpg',
     'Superintelligence:_Paths,_Dangers,_Strategies',
     'Okto',
     'Proyecto_40',
     'AMGTV',
     'Lali_Espósito',
     'La7',
     'Vagina',
     'کس', // mznwiki
     'مقعد', // mznwiki
     'Tobias_Sammet', // dewiki T238942
     'Avantasia', // dewiki T238942
     'Edguy', // dewiki T238942
-    'Pornhub' // dewiki T238942
+    'Pornhub', // dewiki T238942
+    'Index', // T327904
+    'Index_(statistics)',
+    'Index,_Washington',
+    'Index_(economics)',
+    'XXX:_Return_of_Xander_Cage',
+    'XXX_(film_series)',
+    'XXX_(2002_film)',
+    'Cookie_(informatique)'
 ];

 /**
  * @public {!string} date ISO 8601 timestamp of pageviews recorded
  * @public {!number} views Integer pageviews on date
  */
 class DatedPageviews {
     constructor(date, views) {
         this.date = date;
         this.views = views;
     }
 }

 /**
  * Requests the top-pageviews list for a domain and date through
  * pageviews.Client#reqTop, for all platforms (pageviews.Platform.ALL).
  *
  * @param {!Object} req incoming request, handed to pageviews.Client
  * @param {!string} domain domain to query; promise() passes it with the TLD removed
  * @param {*} date date to query, forwarded unchanged to reqTop
  * @return {*} whatever reqTop returns (presumably a promise of the REST
  *   response; promise() consumes it through BBPromise.join)
  */
 function getTopPageviews(req, domain, date) {
     const restReq = {
         params: { domain: 'wikimedia.org' },
         headers: { accept: 'application/json; charset=utf-8' }
     };
     return new pageviews.Client(req, restReq).reqTop(domain, pageviews.Platform.ALL, date);
 }

 /**
  * Converts a per-page pageviews response into DatedPageviews objects, one per
  * element of rsp.body.items, converting item.timestamp with
  * dateUtil.iso8601DateFromYYYYMMDD.
  *
  * @param {!Object} rsp response whose body.items elements carry `timestamp` and `views`
  * @return {!Array<DatedPageviews>}
  */
 function pageviewsPageRspToDatedPageviews(rsp) {
     return rsp.body.items.map((item) => {
         return new DatedPageviews(dateUtil.iso8601DateFromYYYYMMDD(item.timestamp), item.views);
     });
 }

 /**
  * Requests the daily, user-agent pageviews of entry.article between startDate
  * and endDate (all platforms) and maps the response to DatedPageviews.
  *
  * @param {!Object} req incoming request, handed to pageviews.Client
  * @param {!string} domain full domain; the TLD is removed here before the request
  * @param {*} startDate forwarded unchanged to reqPage
  * @param {*} endDate forwarded unchanged to reqPage
  * @param {!Object} entry only entry.article is read
  * @return {*} the result of reqPage(...).then(...), i.e. a thenable resolving
  *   to an array of DatedPageviews
  */
 function getViewHistory(req, domain, startDate, endDate, entry) {
     const restReq = {
         params: { domain: 'wikimedia.org' },
         headers: { accept: 'application/json; charset=utf-8' }
     };
     const client = new pageviews.Client(req, restReq);
     return client.reqPage(util.removeTLD(domain), pageviews.Platform.ALL, pageviews.Agent.USER,
         entry.article, pageviews.Granularity.DAILY, startDate, endDate)
     .then(pageviewsPageRspToDatedPageviews);
 }

 /**
  * Keeps only entries in namespace id 0 whose canonical title is not the main
  * page. The main page comparison is a whole-string, case-insensitive match.
  *
  * @param {!Array<Object>} articles entries exposing namespace.id and titles.canonical
  * @param {!string} mainPageTitle main page title; regex-escaped before matching
  * @return {!Array<Object>} a new, filtered array
  */
 function filterSpecial(articles, mainPageTitle) {
     const mainPageRegExp = new RegExp(`^${regexEscape(mainPageTitle)}$`, 'i');
     return articles.filter((entry) => {
         return entry.namespace.id === 0 && !mainPageRegExp.test(entry.titles.canonical);
     });
 }

 /**
  * @param {!string} title
  * @return {!boolean} true when title is not an exact member of DENY_LIST
  */
 function isAllowed(title) {
     return DENY_LIST.indexOf(title) === -1;
 }

 /**
  * Builds the most-read response for a request.
  *
  * Resolves with `{ meta: {} }` (no payload) when:
  * - the domain is fy.wikipedia.org;
  * - the request date is invalid and req.query.aggregated is set;
  * - req.query.aggregated is set and the chain rejects with err.status === 404.
  * An invalid date on a non-aggregated request goes to dateUtil.throwDateError().
  * For aggregated requests the day before the requested date is queried.
  *
  * @param {*} app not referenced in this function
  * @param {!Object} req reads req.params.domain, req.query.aggregated and req.logger
  * @return {!BBPromise} resolves to
  *   `{ payload: { date, articles }, meta: { revision, vary } }` or `{ meta: {} }`
  */
 function promise(app, req) {
     if (req.params.domain === 'fy.wikipedia.org') {
         return BBPromise.resolve({ meta: {} });
     }

     if (!dateUtil.validate(dateUtil.hyphenDelimitedDateString(req))) {
         if (req.query.aggregated) {
             return BBPromise.resolve({ meta: {} });
         }
         dateUtil.throwDateError();
     }

     const reqDate = dateUtil.getRequestedDate(req);
     const rspDate = req.query.aggregated ? dateUtil.addDays(reqDate, -1) : reqDate;

     return BBPromise.join(
         getTopPageviews(req, util.removeTLD(req.params.domain), rspDate),
         si.getSiteInfo(req),
         (pageviewsResponse, siteinfo) => {
             const mainPage = siteinfo.general && siteinfo.general.mainpage;
             const mainPageTitle = Title.newFromText(mainPage, siteinfo);
             // NOTE(review): this local shadows the module-level `pageviews`
             // import for the rest of the callback.
             const pageviews = pageviewsResponse
                 && pageviewsResponse.body
                 && pageviewsResponse.body.items
                 && pageviewsResponse.body.items[0];
             // NOTE(review): the chain above tolerates a missing body/items[0], but
             // `pageviews.articles` and `pageviewsSlice.map` below are dereferenced
             // unguarded, so a missing value surfaces as a TypeError that the
             // catch handler rethrows (it has no `status`).
             const pageviewsSlice = pageviews.articles
                 && pageviews.articles.slice(0, MAX_TITLES);
             // Parse every title; entries whose title fails with
             // 'title-invalid-utf8' are logged and dropped, other errors propagate.
             const titles = pageviewsSlice.map((entry) => {
                 let ns;
                 try {
                     // eslint-disable-next-line no-underscore-dangle
                     ns = Title.newFromText(entry.article, siteinfo).getNamespace()._id;
                 } catch (e) {
                     if (e.type === 'title-invalid-utf8') {
                         req.logger.log('warn', e);
                         return;
                     } else {
                         throw e;
                     }
                 }
                 return Object.assign({ ns }, entry);
             }).filter((entry) => entry !== undefined);

             const resultsDate = `${pageviews.year}-${pageviews.month}-${pageviews.day}Z`;
             // View history window: the results date and the four days before it.
             const start = dateUtil.addDays(new Date(resultsDate), -4);
             const end = new Date(resultsDate);

             const results = titles.map((entry) => {
                 // Will be merged-in on a later pass
                 entry.$summary = apiUtil.restGetSummary(req, entry.article);
                 entry.view_history = getViewHistory(req, req.params.domain, start, end, entry);

                 delete entry.article;
                 // NOTE(review): `ns` is removed here without having been read
                 // anywhere in this function.
                 delete entry.ns;

                 return entry;
             });

             return util.promiseAwaitAll(results, true, req.logger)
             .then((resp) => {
                 let articles = resp
                 // Throw away items where summaries failed to fetch.
                 .filter((entry) => !!entry.$summary)
                 // Merge the summary content into the article view data.
                 .map((entry) => {
                     Object.assign(entry, entry.$summary);
                     delete entry.$summary;
                     return entry;
                 });
                 // Deduplicate the most-read pages cause some of them might have been
                 // redirects which have now been resolved.
                 // The merge callback adds the duplicate's total views to the kept
                 // entry and, per history date, the first same-date view count.
                 articles = util.removeDuplicateTitles(articles, (orig, dupe) => {
                     orig.views += dupe.views;
                     orig.view_history.forEach((toViewsForDate) => {
                         const filteredViews = dupe.view_history.filter((fromViewsForDate) => {
                             return toViewsForDate.date === fromViewsForDate.date;
                         });
                         if (filteredViews.length) {
                             toViewsForDate.views += filteredViews[0].views;
                         }
                     });
                     return orig;
                 });

                 articles = filterSpecial(articles, mainPageTitle.getPrefixedDBKey());
                 articles = articles.filter((entry) => isAllowed(entry.titles.canonical));
                 // 'accept-language' is reported in meta.vary only when siteinfo
                 // lists more than one variant.
                 const vary = [];
                 if (siteinfo.variants && siteinfo.variants.length > 1) {
                     vary.push('accept-language');
                 }
                 return {
                     payload: {
                         date: resultsDate,
                         articles
                     },
                     meta: {
                         revision: dateUtil.dateStringFrom(req),
                         vary
                     }
                 };
             });
         }).catch((err) => {
         // Catch and handle the error if this is an aggregated request and the
         // pageview data are not yet loaded.
         if (req.query.aggregated && err.status === 404) {
             return BBPromise.resolve({ meta: {} });
         }
         throw err;
     });
 }

 module.exports = {
     promise,

     // visible for testing
     filterSpecial
 };
diff --git a/test/lib/announcements.js b/test/lib/announcements.js
index 8e713ea..786c124 100644
--- a/test/lib/announcements.js
+++ b/test/lib/announcements.js
@@ -1,242 +1,242 @@
 'use strict';

 const domino = require('domino');
 const assert = require('../utils/assert');
 const mut = require('../../lib/announcements.js'); // module under test
 const config = require('../../etc/announcements');

 const inactiveAnnouncementDomain = 'cs.wikipedia.org';
 const activeAnnouncementDomain = 'en.wikipedia.org';
 // Campaign subsets of config.campaigns, selected by campaign.type.
 const fundraisingCampaigns = config.campaigns.filter((campaign) =>
     campaign.type == config.AnnouncementType.FUNDRAISING
 );
 const surveyCampaigns = config.campaigns.filter((campaign) =>
     campaign.type == config.AnnouncementType.SURVEY
 );
 const fundraisingAndSurveyCampaigns = config.campaigns.filter((campaign) =>
     campaign.type == config.AnnouncementType.SURVEY
         || campaign.type == config.AnnouncementType.FUNDRAISING
 );

 describe('lib:announcements', () => {
     it('should return no announcement for inactive wiki', () => {
         const res = mut.getAnnouncements(inactiveAnnouncementDomain);
         assert.ok(res.announce.length === 0);
     });

     // NOTE(review): with the assertion commented out by this change, the test
     // below only verifies that getAnnouncements() does not throw.
     it('should return one or more announcements for active wiki', () => {
         const res = mut.getAnnouncements(activeAnnouncementDomain);
-        // Updated so that fundraising announcements are checked.
-        assert.ok(res.announce.length > 0);
+        // Update when fundraising is active.
+        // assert.ok(res.announce.length > 0);
     });
 });

 describe('lib:announcements:etc', () => {
     // One set of tests is registered per fundraising campaign in the config.
     fundraisingCampaigns.forEach((fundraisingCampaign) => {
         const announcements = mut.testing.getAnnouncementsForCampaign(fundraisingCampaign);

         it('should return no image_url', () => {
             announcements.forEach((announcement) => {
                 assert.ok(!announcement.image_url);
             });
         });

         it('should return correct type', () => {
             announcements.forEach((elem) => {
                 assert.ok(elem.type === config.AnnouncementType.FUNDRAISING);
             });
         });

         it('countries is an array of strings', () => {
             announcements.forEach((elem) => {
                 assert.ok(elem.countries.every(value => typeof value === 'string'));
             });
         });

         it('should not deliver HTML in certain legacy iOS announcements fields', () => {
             const doc = domino.createDocument();
             const iosAnnouncement =
                 mut.testing.getLegacyiOSFundraisingAnnouncements(fundraisingCampaign)[0];
             // destructure 'text' and 'action.title' from the iOS announcement
             const { text, action: { title } } = iosAnnouncement;
             const fieldsToCheck = { text, title };
             for (const textOnlyFieldName of Object.keys(fieldsToCheck)) {
                 const textToCheck = fieldsToCheck[textOnlyFieldName];
                 const element = doc.createElement('div');
                 element.innerHTML = textToCheck;
                 // Comparing innerHTML and textContent lengths catches even non-tag html,
                 // such as character entities like '&nbsp;' (non-breaking space).
                 assert.deepEqual(
                     element.innerHTML.length, element.textContent.length,
                     `iOS does not support HTML in the "${textOnlyFieldName}" field`
                 );
             }
         });

         it('iOS legacy fundraising announcement should have the proper platform ID', () => {
             const announcements =
                 mut.testing.getLegacyiOSFundraisingAnnouncements(fundraisingCampaign);
             announcements.forEach((announcement) => {
                 assert.ok(announcement.platforms.includes('iOSApp'));
                 assert.ok(!announcement.platforms.includes('iOSAppV2'));
                 assert.ok(!announcement.platforms.includes('iOSAppV3'));
                 assert.ok(!announcement.platforms.includes('iOSAppV4'));
             });
         });

         it('iOS fundraising announcement should have the proper platform ID', () => {
             const announcements = mut.testing.getiOSFundraisingAnnouncements(fundraisingCampaign);
             announcements.forEach((announcement) => {
                 assert.ok(!announcement.platforms.includes('iOSApp'));
                 assert.ok(announcement.platforms.includes('iOSAppV2'));
                 assert.ok(announcement.platforms.includes('iOSAppV3'));
                 assert.ok(announcement.platforms.includes('iOSAppV4'));
             });
         });

         it('should deliver HTML in certain V2 announcements fields', () => {
             const doc = domino.createDocument();
             const v2Announcement =
                 mut.testing.getAndroidFundraisingAnnouncements(fundraisingCampaign)[0];
             const { text } = v2Announcement;
             const fieldsToCheck = { text };
             for (const textOnlyFieldName of Object.keys(fieldsToCheck)) {
                 const textToCheck = fieldsToCheck[textOnlyFieldName];
                 const element = doc.createElement('div');
                 element.innerHTML = textToCheck;
                 // Looking for <br> (line break) tags
                 assert.ok(
                     element.querySelector('BR'),
                     // eslint-disable-next-line max-len
                     `V2 announcements should have some HTML line breaks in the "${textOnlyFieldName}" field`
                 );
             }
         });

         it('caption_HTML on iOS should be inside a paragraph', () => {
             // eslint-disable-next-line camelcase
             const { caption_HTML } =
                 mut.testing.getLegacyiOSFundraisingAnnouncements(fundraisingCampaign)[0];
             const doc = domino.createDocument(caption_HTML);
             assert.deepEqual(doc.body.firstElementChild.tagName, 'P');
         });

         it('caption_HTML on Android should not be inside a paragraph', () => {
             // eslint-disable-next-line camelcase
             const { caption_HTML } =
                 mut.testing.getAndroidFundraisingAnnouncements(fundraisingCampaign)[0];
             const doc = domino.createDocument(caption_HTML);
             assert.notDeepEqual(doc.body.firstElementChild.tagName, 'P');
         });
     });

     fundraisingAndSurveyCampaigns.forEach(campaign => {
         it('buildId should not return lower case characters', () => {
             const id = mut.testing.buildId(campaign, 'IOS', 'US');
             assert.deepEqual(id, id.toUpperCase());
         });
     });

     surveyCampaigns.forEach(campaign => {
         // NOTE(review): surveyCampaigns is selected via `campaign.type`, so its
         // elements appear to be objects; a strict comparison against the string
         // 'IOSSURVEY20' would then always be unequal and every survey test below
         // would be skipped. Confirm whether a field such as the campaign id was
         // meant to be compared.
         if (campaign !== 'IOSSURVEY20') {
             return;
         }
         const announcements = mut.testing.getAnnouncementsForCampaign(campaign);

         it('iOS survey announcement should have at least one normalized string in article titles', () => {
             announcements.forEach((announcement) => {
                 assert.ok(announcement.articleTitles.length > 0);
                 announcement.articleTitles.forEach((title) => {
                     assert.ok(typeof title == 'string');
                     assert.ok(!title.includes('_'))
                 });
             });
         });

         it('iOS survey announcement should have a displayDelay number > 0', () => {
             announcements.forEach((announcement) => {
                 assert.ok(typeof announcement.displayDelay == 'number');
                 assert.ok(announcement.displayDelay > 0);
             });
         });

         it('iOS survey announcement should have the proper platform ID', () => {
             announcements.forEach((announcement) => {
                 assert.ok(!announcement.platforms.includes('iOSApp'));
                 assert.ok(!announcement.platforms.includes('iOSAppV2'));
                 assert.ok(announcement.platforms.includes('iOSAppV3'));
             });
         });
     });

     describe('.hasEnded', () => {
         const fundraisingCampaign = fundraisingCampaigns[0];
         let oldEndTime;

         // The tests overwrite endTime on the shared config object, so it is
         // saved before and restored after each test.
         beforeEach(() => {
             oldEndTime = fundraisingCampaign.endTime;
         });

         afterEach(() => {
             fundraisingCampaign.endTime = oldEndTime;
         });

         it('invalid endTime', () => {
             fundraisingCampaign.endTime = 'INVALID';
             assert.throws(() => {
                 // NOTE(review): `Date(...)` without `new` ignores its argument and
                 // returns a string for the current time; the other tests in this
                 // suite pass `new Date(...)`.
                 mut.testing.hasEnded(fundraisingCampaign, Date(Date.UTC(2030, 5, 1)));
             }, /config_error/);
         });

         it('endTime has passed', () => {
             fundraisingCampaign.endTime = '2017-12-20T23:59:00Z';
             assert.ok(mut.testing.hasEnded(fundraisingCampaign,
                 new Date(Date.UTC(2017, 11, 21))));
         });

         it('endTime has not passed yet', () => {
             fundraisingCampaign.endTime = '2017-12-20T23:59:00Z';
             assert.ok(!mut.testing.hasEnded(fundraisingCampaign,
                 new Date(Date.UTC(2017, 11, 20))));
         });
     });

     describe('announcements-unit-config', () => {
         const THIS_YEAR = new Date().getUTCFullYear();
         // Example: '2017-11-30T16:00:00Z'
         const SIMPLIFIED_ISO8610_REGEX = /^\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z)$/;

         function checkValidDateStringFormat(dateString, label) {
             assert.ok(SIMPLIFIED_ISO8610_REGEX.test(dateString),
                 `invalid date string format in ${label}`);
         }

         function checkYear(date, label) {
             const res = date.getUTCFullYear();
             // NOTE(review): `THIS_YEAR - 1 < res || res < THIS_YEAR + 1` is true for
             // every non-NaN year, so this assertion cannot reject an out-of-range
             // year; `&&` (possibly with `<=`) may have been intended. Confirm.
             assert.ok(THIS_YEAR - 1 < res || res < THIS_YEAR + 1,
                 `invalid year ${res} in ${label}`);
         }

         function checkValidDate(date, label) {
             assert.ok(!isNaN(date.getTime()), `invalid date in ${label}`);
         }

         // Runs the string-format, validity and year checks for one date.
         function checkDate(date, dateString, label) {
             checkValidDateStringFormat(dateString, label);
             checkValidDate(date, label);
             checkYear(date, label);
         }

         it('all dates should be valid', () => {
             fundraisingAndSurveyCampaigns.forEach(campaign => {
                 const startDate = new Date(campaign.startTime);
                 const endDate = new Date(campaign.endTime);
                 checkDate(startDate, campaign.startTime, 'startTime');
                 checkDate(endDate, campaign.endTime, 'endTime');
                 assert.ok(startDate < endDate, 'endTime should be greater than startTime!');
             });
         });
     });
 });