diff --git a/tests/roundtrip-test.js b/tests/roundtrip-test.js index 62a5ff84f..4c8df066f 100755 --- a/tests/roundtrip-test.js +++ b/tests/roundtrip-test.js @@ -1,696 +1,732 @@ #!/usr/bin/env node "use strict"; require( '../lib/core-upgrade.js' ); var request = require( 'request' ), yargs = require( 'yargs' ), domino = require( 'domino' ), url = require( 'url' ), zlib = require( 'zlib' ), JSUtils = require( '../lib/jsutils.js' ).JSUtils, Util = require( '../lib/mediawiki.Util.js' ).Util, DU = require( '../lib/mediawiki.DOMUtils.js' ).DOMUtils, TemplateRequest = require( '../lib/mediawiki.ApiRequest.js' ).TemplateRequest, ParsoidConfig = require( '../lib/mediawiki.ParsoidConfig' ).ParsoidConfig, MWParserEnvironment = require( '../lib/mediawiki.parser.environment.js' ).MWParserEnvironment, Diff = require('../lib/mediawiki.Diff.js').Diff; var plainCallback = function ( env, err, results ) { var i, result, output = '', semanticDiffs = 0, syntacticDiffs = 0, testDivider = ( new Array( 70 ) ).join( '=' ) + '\n', diffDivider = ( new Array( 70 ) ).join( '-' ) + '\n'; if ( err ) { output += 'Parser failure!\n\n'; output += diffDivider; output += err; } else { for ( i = 0; i < results.length; i++ ) { result = results[i]; output += testDivider; if ( result.type === 'fail' ) { output += 'Semantic difference' + (result.selser ? ' (selser)' : '') + ':\n\n'; output += result.wtDiff + '\n'; output += diffDivider + 'HTML diff:\n\n' + result.htmlDiff + '\n'; semanticDiffs++; } else { output += 'Syntactic difference' + (result.selser ? ' (selser)' : '') + ':\n\n'; output += result.wtDiff + '\n'; syntacticDiffs++; } } output += testDivider; output += testDivider; output += "SUMMARY:\n"; output += "Semantic differences : " + semanticDiffs + "\n"; output += "Syntactic differences: " + syntacticDiffs + "\n"; output += diffDivider; output += "ALL differences : " + (semanticDiffs + syntacticDiffs) + "\n"; output += testDivider; output += testDivider; } return output; }; var encodeXmlEntities = function( str ) { return str.replace( /&/g, '&' ) .replace( //g, '>' ); }; function encodeAttribute (str) { return encodeXmlEntities(str) .replace(/"/g, '"'); } var xmlCallback = function ( env, err, results ) { var i, result; var prefix = ( env && env.conf && env.conf.wiki && env.conf.wiki.iwp ) || ''; var title = ( env && env.page && env.page.name ) || ''; var output = '\n'; var outputTestSuite = function (selser) { output += '\n'; }; if ( err ) { outputTestSuite(false); output += ''; output += encodeXmlEntities( err.stack || err.toString() ); output += ''; } else if (!results.length) { outputTestSuite(false); } else { var currentSelser = results[0].selser; outputTestSuite(currentSelser); for ( i = 0; i < results.length; i++ ) { result = results[i]; // When going from normal to selser results, switch to a new // test suite. if (currentSelser !== result.selser) { output += '\n'; currentSelser = result.selser; outputTestSuite(currentSelser); } output += '\n'; if ( result.type === 'fail' ) { output += '\n'; output += '\n'; output += encodeXmlEntities( result.wtDiff ); output += '\n\n'; output += '\n'; output += encodeXmlEntities( result.htmlDiff ); output += '\n\n'; output += '\n'; } else { output += '\n'; output += encodeXmlEntities( result.wtDiff ); output += '\n\n'; } output += '\n'; } } output += '\n'; // Output the profiling data if ( env.profile ) { // Delete the total timer to avoid serializing it if (env.profile.time && env.profile.time.total_timer) { delete( env.profile.time.total_timer ); } output += '\n'; for ( var type in env.profile ) { for ( var prop in env.profile[ type ] ) { output += ''; output += DU.encodeXml( env.profile[ type ][ prop ].toString() ); output += '\n'; } } output += '\n'; } output += ''; return output; }; var findMatchingNodes = function (root, targetRange, sourceLen) { var currentOffset = null, wasWaiting = false, waitingForEndMatch = false; function walkDOM(element) { var elements = [], precedingNodes = [], attribs = DU.getJSONAttribute(element, 'data-parsoid'); if ( attribs.dsr && attribs.dsr.length ) { var start = attribs.dsr[0] || 0, end = attribs.dsr[1] || sourceLen - 1; if ( (targetRange.end - 1) < start || targetRange.start > (end - 1) ) { return null; } if ( waitingForEndMatch ) { if ( end >= targetRange.end ) { waitingForEndMatch = false; } return { done: true, nodes: [element] }; } if ( attribs.dsr[0] !== null && targetRange.start === start && end === targetRange.end ) { return { done: true, nodes: [element] }; } else if ( targetRange.start === start ) { waitingForEndMatch = true; if (end < targetRange.end) { // No need to walk children return { done: false, nodes: [element] }; } } else if (start > targetRange.start && end < targetRange.end) { // No need to walk children return { done: false, nodes: [element] }; } } var c = element.firstChild; while (c) { wasWaiting = waitingForEndMatch; if ( DU.isElt(c) ) { var res = walkDOM(c); var matchedChildren = res ? res.nodes : null; if ( matchedChildren ) { if ( !currentOffset && attribs.dsr && (attribs.dsr[0] !== null) ) { var elesOnOffset = []; currentOffset = attribs.dsr[0]; // Walk the preceding nodes without dsr values and prefix matchedChildren // till we get the desired matching start value. var diff = currentOffset - targetRange.start; while ( precedingNodes.length > 0 && diff > 0 ) { var n = precedingNodes.pop(); var len = n.nodeValue.length + (n.nodeType === c.COMMENT_NODE ? 7 : 0); if ( len > diff ) { break; } diff -= len; elesOnOffset.push( n ); } elesOnOffset.reverse(); matchedChildren = elesOnOffset.concat( matchedChildren ); } // Check if there's only one child, and make sure it's a node with getAttribute if ( matchedChildren.length === 1 && DU.isElt(matchedChildren[0]) ) { var childAttribs = matchedChildren[0].getAttribute( 'data-parsoid' ); if ( childAttribs ) { childAttribs = JSON.parse( childAttribs ); if ( childAttribs.dsr && childAttribs.dsr[1]) { if ( childAttribs.dsr[1] >= targetRange.end ) { res.done = true; } else { currentOffset = childAttribs.dsr[1]; } } } } if (res.done) { res.nodes = matchedChildren; return res; } else { elements = matchedChildren; } } else if (wasWaiting || waitingForEndMatch) { elements.push(c); } // Clear out when an element node is encountered. precedingNodes = []; } else if ( c.nodeType === c.TEXT_NODE || c.nodeType === c.COMMENT_NODE ) { if ( currentOffset && ( currentOffset < targetRange.end ) ) { currentOffset += c.nodeValue.length; if ( c.nodeType === c.COMMENT_NODE ) { // Add the length of the ' bits currentOffset += 7; } if ( currentOffset >= targetRange.end ) { waitingForEndMatch = false; } } if (wasWaiting || waitingForEndMatch) { // Part of target range elements.push(c); } else if ( !currentOffset ) { // Accumulate nodes without dsr precedingNodes.push( c ); } } if ( wasWaiting && !waitingForEndMatch ) { break; } // Skip over encapsulated content var typeOf = DU.isElt(c) ? c.getAttribute( 'typeof' ) || '' : ''; if (/\bmw:(?:Transclusion\b|Param\b|Extension\/[^\s]+)/.test(typeOf)) { c = DU.skipOverEncapsulatedContent(c); } else { c = c.nextSibling; } } var numElements = elements.length; var numChildren = element.childNodes.length; if (numElements === 0) { return null; } else if ( numElements < numChildren ) { return { done: !waitingForEndMatch, nodes: elements } ; } else { /* numElements === numChildren */ return { done: !waitingForEndMatch, nodes: [element] } ; } } return walkDOM(root); }; var checkIfSignificant = function ( env, offsets, src, body, out, cb, document ) { var normalizeWikitext = function ( str ) { // Ignore leading tabs vs. leading spaces str = str.replace(/^\t/, ' '); str = str.replace(/\n\t/g, '\n '); // Normalize multiple spaces to single space str = str.replace(/ +/g, " "); // Eliminate spaces around wikitext chars // gwicke: disabled for now- too aggressive IMO //str = str.replace(/([<"'!#\*:;+-=|{}\[\]\/]) /g, "$1"); // Ignore capitalization of tags and void tag indications str = str.replace(/<(\/?)([^ >\/]+)((?:[^>\/]|\/(?!>))*)\/?>/g, function(match, close, name, remaining) { return '<' + close + name.toLowerCase() + remaining.replace(/ $/, '') + '>'; } ); // Ignore whitespace in table cell attributes str = str.replace(/(^|\n|\|(?=\|)|!(?=!))(\{\||\|[\-+]*|!) *([^|\n]*?) *(?=[|\n]|$)/g, '$1$2$3'); // Ignore trailing semicolons and spaces in style attributes str = str.replace(/style\s*=\s*"[^"]+"/g, function(match) { return match.replace(/\s|;(?=")/g, ''); }); // Strip double-quotes str = str.replace(/"([^"]*?)"/g, "$1"); // Ignore implicit and in table cells or the end // of the string for now str = str.replace(/(^|\n)<\/(?:small|center)>(?=\n[|!]|\n?$)/g, ''); str = str.replace(/([|!].*?)<\/(?:small|center)>(?=\n[|!]|\n?$)/gi, '$1'); return str; }; var i, k, diff, offset, origOut, newOut, origHTML, newHTML, origOrigHTML, origNewHTML, thisResult, results = []; for ( i = 0; i < offsets.length; i++ ) { thisResult = {}; origOrigHTML = ''; origNewHTML = ''; offset = offsets[i]; thisResult.offset = offset; // console.warn("--processing: " + JSON.stringify(offset)); if ( offset[0].start === offset[0].end && out.substr(offset[1].start, offset[1].end - offset[1].start) .match(/^\n?<\/[^>]+>\n?$/) ) { // An element was implicitly closed. Fudge the orig offset // slightly so it finds the corresponding elements which have the // original (unclosed) DSR. offset[0].start--; } // console.warn("--orig--"); var res = findMatchingNodes( body, offset[0] || {}, src.length); origOut = res ? res.nodes : []; for ( k = 0; k < origOut.length; k++ ) { // node need not be an element always! origOrigHTML += DU.serializeNode(origOut[k], {smartQuote: false}); } origHTML = DU.formatHTML( DU.normalizeOut( origOrigHTML ) ); // console.warn("# nodes: " + origOut.length); // console.warn("html: " + origHTML); // console.warn("--new--"); res = findMatchingNodes( document.body, offset[1] || {}, out.length); newOut = res ? res.nodes : []; for ( k = 0; k < newOut.length; k++ ) { // node need not be an element always! origNewHTML += DU.serializeNode(newOut[k], {smartQuote: false}); } newHTML = DU.formatHTML( DU.normalizeOut( origNewHTML ) ); // console.warn("# nodes: " + newOut.length); // console.warn("html: " + newHTML); // compute wt diffs var wt1 = src.substring( offset[0].start, offset[0].end ); var wt2 = out.substring( offset[1].start, offset[1].end ); //thisResult.wtDiff = Util.contextDiff(wt1, wt2, false, true, true); // Get diff substrings from offsets /* jshint loopfunc: true */ // this function doesn't use loop variables var formatDiff = function ( offset, context ) { return [ '----', src.substring(offset[0].start - context, offset[0].end + context), '++++', out.substring(offset[1].start - context, offset[1].end + context) ].join('\n'); }.bind( this, offset ); diff = Diff.htmlDiff( origHTML, newHTML, false, true, true ); // No context by default thisResult.wtDiff = formatDiff(0); // Normalize wts to check if we really have a semantic diff thisResult.type = 'skip'; if (diff.length > 0) { var normWT1 = normalizeWikitext(wt1), normWT2 = normalizeWikitext(wt2); if ( normWT1 !== normWT2 ) { //console.log( 'normDiff: =======\n' + normWT1 + '\n--------\n' + normWT2); thisResult.htmlDiff = diff; thisResult.type = 'fail'; // Provide context for semantic diffs thisResult.wtDiff = formatDiff(25); } } results.push( thisResult ); } cb( null, env, results ); }; var doubleRoundtripDiff = function ( env, offsets, src, body, out, cb ) { if ( offsets.length > 0 ) { env.setPageSrcInfo( out ); env.errCB = function ( error ) { cb( error, env, [] ); process.exit( 1 ); }; var parserPipeline = env.pipelineFactory.getPipeline('text/x-mediawiki/full'); parserPipeline.once( 'document', checkIfSignificant.bind( null, env, offsets, src, body, out, cb ) ); parserPipeline.processToplevelDoc( out ); } else { cb( null, env, [] ); } }; -var parsoidPost = function (env, parsoidURL, prefix, title, text, oldid, +var parsoidPost = function (env, uri, domain, title, text, dp, oldid, recordSizes, profilePrefix, cb) { var data = {}; - if ( oldid ) { - data.oldid = oldid; - data.html = text; - } else { - data.wt = text; + // make sure the Parsoid URI ends on / + if ( !/\/$/.test(uri) ) { + uri += '/'; } + uri += 'v2/' + domain + '/'; + title = encodeURIComponent(title); - // make sure parsoidURL ends on / - if ( !/\/$/.test(parsoidURL) ) { - parsoidURL += '/'; + if ( oldid ) { + // We want html2wt + uri += 'wt/' + title + '/' + oldid; + data.html = { + body: text + }; + data.original = { + 'data-parsoid': dp + }; + } else { + // We want wt2html + uri += 'pagebundle/' + title; + data.wikitext = text; } var options = { - uri: parsoidURL + prefix + '/' + encodeURI(title), + uri: uri, method: 'POST', - headers: { - 'Content-Type': 'application/x-www-form-urlencoded', - }, - encoding: 'utf8', - form: data + json: true, + body: data }; Util.retryingHTTPRequest( 10, options, function( err, res, body ) { if (err) { cb( err, null ); } else if (res.statusCode !== 200) { cb(res.body, null); } else { + var resBody, resDP; + if (oldid) { + // Extract the wikitext from the response + resBody = body.wikitext.body; + } else { + resBody = body.html.body; + resDP = body['data-parsoid']; + } if ( env.profile ) { if (!profilePrefix) { profilePrefix = ''; } // FIXME: Parse time was removed from profiling when we stopped // sending the x-parsoid-performance header. if (recordSizes) { // Record the sizes var sizePrefix = profilePrefix + (oldid ? 'wt' : 'html'); env.profile.size[ sizePrefix + 'raw' ] = - body.length; + resBody.length; // Compress to record the gzipped size - zlib.gzip( res.body, function( err, gzippedbuf ) { + zlib.gzip( resBody, function( err, gzippedbuf ) { if ( !err ) { env.profile.size[ sizePrefix + 'gzip' ] = gzippedbuf.length; } - cb( null, body ); + cb( null, resBody, resDP ); } ); } else { - cb(null, body); + cb(null, resBody, resDP); } } else { - cb( null, body ); + cb( null, resBody, resDP ); } } } ); }; var roundTripDiff = function ( env, src, html, out, cb ) { var diff, offsetPairs; try { diff = Diff.diffLines(out, src); offsetPairs = Diff.convertDiffToOffsetPairs(diff); if ( diff.length > 0 ) { var body = domino.createDocument( html ).body; doubleRoundtripDiff( env, offsetPairs, src, body, out, cb ); } else { cb( null, env, [] ); } } catch ( e ) { cb( e, env, [] ); } }; var selserRoundTripDiff = function (env, html, out, diffs, cb) { var selserDiff, offsetPairs, src = env.page.src.replace(/\n(?=\n)/g, '\n '); // Remove the selser trigger comment out = out.replace(/\n*$/, ''); out = out.replace(/\n(?=\n)/g, '\n '); roundTripDiff(env, src, html, out, function (err, env, selserDiffs) { if (err) { cb(err, env, diffs); } else { for (var sD in selserDiffs) { selserDiffs[sD].selser = true; } if (selserDiffs.length) { diffs = diffs.concat(selserDiffs); } cb(null, env, diffs); } }); }; // Returns a Promise for an { env, rtDiffs } object. `cb` is optional. var fetch = function ( page, options, cb ) { cb = JSUtils.mkPromised( cb, [ 'env', 'rtDiffs' ] ); - var prefix = options.prefix || 'enwiki'; + var domain, prefix, apiURL, + // options are ParsoidConfig options if module.parent, otherwise they + // are CLI options (so use the Util.set* helpers to process them) + parsoidConfig = new ParsoidConfig( module.parent ? options : null ); + if (!module.parent) { + // only process CLI flags if we're running as a CLI program. + Util.setTemplatingAndProcessingFlags( parsoidConfig, options ); + Util.setDebuggingFlags( parsoidConfig, options ); + } if ( options.apiURL ) { - prefix = 'customwiki'; + parsoidConfig.setInterwiki(options.prefix || 'localhost', options.apiURL); + } + if (options.prefix) { + // If prefix is present, use that. + prefix = options.prefix; + // Get the domain from the interwiki map. + apiURL = parsoidConfig.interwikiMap.get(prefix); + if (!apiURL) { + cb("Couldn't find the domain for prefix " + prefix, null, []); + } + domain = url.parse(apiURL).hostname; + } else if (options.domain) { + domain = options.domain; + prefix = parsoidConfig.reverseIWMap.get(domain); } var envCb = function ( err, env ) { env.errCB = function ( error ) { cb( error, env, [] ); }; if ( err !== null ) { env.errCB( err ); return; } env.profile = { time: { total: 0, total_timer: new Date() }, size: {} }; var target = env.resolveTitle( env.normalizeTitle( env.page.name ), '' ); var tpr = new TemplateRequest( env, target, null ); tpr.once( 'src', function ( err, src_and_metadata ) { if ( err ) { cb( err, env, [] ); } else { // Shortcut for calling parsoidPost with common options - var parsoidPostShort = function (postBody, postOldId, + var parsoidPostShort = function (postBody, postDp, postOldId, postRecordSizes, postProfilePrefix, postCb) { - parsoidPost(env, options.parsoidURL, prefix, page, - postBody, postOldId, postRecordSizes, postProfilePrefix, - function (err, postResult) { + parsoidPost(env, options.parsoidURL, domain, page, + postBody, postDp, postOldId, postRecordSizes, postProfilePrefix, + function (err, postResult, postResultDp) { if (err) { cb(err, env, []); } else { - postCb(postResult); + postCb(postResult, postResultDp); } }); }; // Once we have the diffs between the round-tripped wt, // to test rt selser we need to modify the HTML and request // the wt again to compare with selser, and then concat the // resulting diffs to the ones we got from basic rt - var rtSelserTest = function (origHTMLBody, err, env, rtDiffs) { + var rtSelserTest = function (origHTMLBody, origDp, err, env, rtDiffs) { if (err) { cb(err, env, rtDiffs); } else { var newDocument = DU.parseHTML(origHTMLBody), newNode = newDocument.createComment('rtSelserEditTestComment'); newDocument.body.appendChild(newNode); - parsoidPostShort(newDocument.outerHTML, + parsoidPostShort(newDocument.outerHTML, origDp, src_and_metadata.revision.revid, false, 'selser', function (wtSelserBody) { // Finish the total time now if ( env.profile && env.profile.time ) { env.profile.time.total += new Date() - env.profile.time.total_timer; } selserRoundTripDiff(env, origHTMLBody, wtSelserBody, rtDiffs, cb); }); } }; env.setPageSrcInfo(src_and_metadata); // First, fetch the HTML for the requested page's wikitext - parsoidPostShort(env.page.src, null, true, null, function (htmlBody) { + parsoidPostShort(env.page.src, null, null, true, null, function (htmlBody, htmlDp) { // Now, request the wikitext for the obtained HTML - parsoidPostShort(htmlBody, + // (without sending data-parsoid, as we don't want selser yet). + parsoidPostShort(htmlBody, htmlDp, src_and_metadata.revision.revid, true, null, function (wtBody) { roundTripDiff(env, env.page.src, htmlBody, wtBody, - rtSelserTest.bind(null, htmlBody)); + rtSelserTest.bind(null, htmlBody, htmlDp)); }); }); } } ); }; - // options are ParsoidConfig options if module.parent, otherwise they - // are CLI options (so use the Util.set* helpers to process them) - var parsoidConfig = new ParsoidConfig( module.parent ? options : null, { defaultWiki: prefix } ); - if (!module.parent) { - // only process CLI flags if we're running as a CLI program. - Util.setTemplatingAndProcessingFlags( parsoidConfig, options ); - Util.setDebuggingFlags( parsoidConfig, options ); - } - MWParserEnvironment.getParserEnv( parsoidConfig, null, { prefix: prefix, pageName: page }, envCb ); return cb.promise; }; var cbCombinator = function ( formatter, cb, err, env, text ) { cb( err, formatter( env, err, text ) ); }; var consoleOut = function ( err, output ) { if ( err ) { console.log( 'ERROR: ' + err); if (err.stack) { console.log( 'Stack trace: ' + err.stack); } process.exit( 1 ); } else { console.log( output ); process.exit( 0 ); } }; if ( typeof module === 'object' ) { module.exports.fetch = fetch; module.exports.plainFormat = plainCallback; module.exports.xmlFormat = xmlCallback; module.exports.cbCombinator = cbCombinator; } if ( !module.parent ) { var standardOpts = Util.addStandardOptions({ 'xml': { description: 'Use xml callback', 'boolean': true, 'default': false }, 'prefix': { description: 'Which wiki prefix to use; e.g. "enwiki" for English wikipedia, "eswiki" for Spanish, "mediawikiwiki" for mediawiki.org', 'default': '' }, + 'domain': { + description: 'Which wiki to use; e.g. "en.wikipedia.org" for English wikipedia', + 'default': 'en.wikipedia.org' + }, 'parsoidURL': { description: 'The URL for the Parsoid API', } }, { // defaults for standard options rtTestMode: true // suppress noise by default }); var opts = yargs.usage( 'Usage: $0 [options] \n\n', standardOpts ).check(Util.checkUnknownArgs.bind(null, standardOpts)); var callback; var argv = opts.argv; var title = argv._[0]; if ( title ) { callback = cbCombinator.bind( null, Util.booleanOption( argv.xml ) ? xmlCallback : plainCallback, consoleOut ); if ( !argv.parsoidURL ) { // Start our own Parsoid server // TODO: This will not be necessary once we have a top-level testing // script that takes care of setting everything up. - var apiServer = require( './apiServer.js' ); - apiServer.startParsoidServer({ quiet: true }).then(function( ret ) { + var apiServer = require( './apiServer.js' ), + parsoidOptions = {quiet: true}; + if (opts.apiURL) { + parsoidOptions.mockUrl = opts.apiURL; + } + apiServer.startParsoidServer(parsoidOptions).then(function( ret ) { argv.parsoidURL = ret.url; fetch( title, argv, callback ); } ).done(); apiServer.exitOnProcessTerm(); } else { fetch( title, argv, callback ); } } else { opts.showHelp(); } }