""" ``vandalized_articles_checker -h`` :: "vandalized_articles_checker.py" is a python utility that takes article_titles as command-line arguments then uses the ORES editquality damaging model to return a percentage of damaging edits per article. Usage: vandalized_articles_checker -h | --help vandalized_articles_checker ... Options: -h --help Print this documentation. List of input article titles e.g: "Michael Jordan" "Barack Obama" """ import mwapi import logging.config from docopt import docopt from revscoring import Model from revscoring.extractors import api from tabulate import tabulate from tqdm import tqdm def main(argv=None): """ Parse command-line arguments, analyse each article's edits and print their vandalism percentages in a table. """ cli_args = docopt(__doc__, argv=argv) article_titles = cli_args[""] mwapi_session = mwapi.Session( host="https://en.wikipedia.org", user_agent="vandalized_articles_checker" ) articles_and_vandalism_percentages = articles_vandalism_percentages( article_titles, mwapi_session ) vandalized_articles_table(articles_and_vandalism_percentages) def vandalized_articles_table(articles_and_vandalism_percentages): """ Print tabular data that shows percentages of vandalized articles """ vandalized_articles_table = tabulate( articles_and_vandalism_percentages, headers=["Article", "Percentage"], tablefmt="grid", ) print(vandalized_articles_table) def articles_vandalism_percentages(article_titles, mwapi_session): """ Return articles and their vandalism percentages while printing progress bars as each article is being processed. """ articles_and_vandalism_percentages = [] articles_bar = "progress analysing all articles" article_bar = "analysing edits for '" for article_title in tqdm(article_titles, desc=articles_bar): edits = edit_ids(article_title, mwapi_session) damaging_edits_scores = [ damaging_score(edit_id, mwapi_session) for edit_id in tqdm( edits, desc=article_bar + article_title + "'" ) ] damaging_edits_percentage = percentage_of_damaging_edits( damaging_edits_scores ) articles_and_vandalism_percentages.append( [article_title, damaging_edits_percentage] ) return articles_and_vandalism_percentages def percentage_of_damaging_edits(damaging_edits_scores): """ Return percentage of damaging edits based on damaging edits' scores """ number_of_damaging_edits = damaging_edits_scores.count(True) number_of_edits = len(damaging_edits_scores) percentage_of_damaging_edits = ( number_of_damaging_edits / number_of_edits ) * 100 return percentage_of_damaging_edits def edit_ids(article_title, mwapi_session): """ Return edit ids based on article title and number_of_edits_limit """ number_of_edits_limit = "100" # MediaWiki API has an rvlimit: 1 - 500 mwapi_response = mwapi_session.get( action="query", prop="revisions", titles=article_title, rvlimit=number_of_edits_limit, rvprop="ids|timestamp|user", rvslots="main", formatversion="2", format="json", ) edits = mwapi_response["query"]["pages"][0]["revisions"] edit_ids = [edit["revid"] for edit in edits] return edit_ids def damaging_score(edit_id, mwapi_session): """ Return damaging score for an edit id """ disable_logging(True) with open("enwiki.damaging.gradient_boosting.model") as f: model = Model.load(f) disable_logging(False) extractor = api.Extractor(mwapi_session) damaging_score = True try: values = extractor.extract(edit_id, model.features) model_score = model.score(values) damaging_score = model_score["prediction"] except Exception: # handling revscoring deleted revisions pass return damaging_score def disable_logging(config): """ Enable or diable logging """ logging.config.dictConfig({ "version": 1, "disable_existing_loggers": config }) if __name__ == "__main__": main()