QUERY =
-- Purpose: one row per scored enwiki revision with its revert-risk score and
-- editor / outcome attributes (17 columns), excluding edits made by sysops
-- and excluding reverts whose author reverted their own edit.
--
-- Output = (scored edits that are NOT reverts)
--          UNION ALL
--          (scored edits that ARE reverts of somebody else's edit)
--
-- NOTE(review): the run logged below this query returned 0 rows. The cause is
-- not visible here; the INNER JOIN on wmf.mediawiki_history with
-- snapshot = '2024-11' is one candidate worth checking (is that snapshot
-- partition still available?).
WITH

-- Scored revisions enriched with editor and revert attributes.
revert_risks AS (
    SELECT
        rr.wiki_db AS wiki_db,
        rr.rev_id AS revision_id,
        -- presumably a column of rr (left unqualified as in the original)
        rev_revert_risk AS revert_risk_score,
        rr.rev_timestamp AS edit_timestamp,
        mwh.event_user_text,
        mwh.event_user_revision_count AS user_edit_count,
        -- Newly created account: the edit happened less than 90 days
        -- (7776000 s = 90 * 24 * 3600) after the account was registered.
        -- Users absent from the canonical users table are never "new".
        CASE
            WHEN rru.user_id IS NOT NULL
                AND unix_timestamp(rr.rev_timestamp)
                    - unix_timestamp(rru.user_registration_timestamp) < 7776000
                THEN TRUE
            ELSE FALSE
        END AS is_new_account,
        -- Was the user anonymous (NULL is treated as "not anonymous").
        COALESCE(mwh.event_user_is_anonymous, FALSE) AS is_anon,
        -- Bot only when the bot-flag array is non-empty. COALESCE + "> 0"
        -- keeps a NULL array (SIZE = NULL or -1 depending on Spark settings)
        -- from being reported as a bot.
        COALESCE(SIZE(mwh.event_user_is_bot_by_historical), 0) > 0 AS user_is_bot,
        -- This edit reverts another edit.
        mwh.revision_is_identity_revert AS is_revert,
        rr.page_title,
        mwh.page_namespace_is_content AS namespace_is_content,
        -- Was this edit itself reverted.
        COALESCE(rr.rev_is_identity_reverted, FALSE) AS is_reverted,
        -- Seconds until this edit was reverted.
        rr.rev_seconds_to_identity_revert AS time_to_revert,
        -- Was the user extended confirmed.
        COALESCE(
            ARRAY_CONTAINS(mwh.event_user_groups, 'extendedconfirmed'),
            FALSE
        ) AS is_extendedconfirmed,
        -- The user has at least one block-change event on this wiki.
        bc.user_text IS NOT NULL AS is_blocked,
        -- The edit is the parent revision of an Automoderator revert.
        amr.amr_rev_parent_id IS NOT NULL AS was_reverted_by_automoderator
    FROM risk_observatory.revert_risk_predictions AS rr
    INNER JOIN wmf.mediawiki_history AS mwh
        ON rr.wiki_db = mwh.wiki_db
        AND rr.rev_id = mwh.revision_id
    -- Automoderator reverts, restricted to one wiki and one daily snapshot.
    LEFT JOIN wmf_product.automoderator_monitoring_snapshot_daily AS amr
        ON rr.rev_id = amr.amr_rev_parent_id
        AND rr.wiki_db = amr.wiki_db
        -- presumably a column of amr (left unqualified as in the original)
        AND is_amr_revert
        AND amr.wiki_db = 'enwiki'
        AND amr.snapshot_date = '2025-02-15'
    -- Users with any block-change event. De-duplicated to one row per
    -- (user, wiki) so that a user with several block events does not
    -- multiply the edit rows; only the existence of a match is used.
    -- NOTE: the "blocked after the edit" restriction
    -- (bc.meta.dt > rr.rev_timestamp) was disabled in the original query and
    -- is still not applied; re-enabling it requires exposing meta.dt here.
    LEFT JOIN (
        SELECT DISTINCT
            user_text,
            `database`
        FROM event.mediawiki_user_blocks_change
        WHERE `database` = 'enwiki'
    ) AS bc
        ON rr.user_name = bc.user_text
        AND rr.wiki_db = bc.`database`
    -- Canonical users dataset, used to identify new accounts.
    LEFT JOIN mneisler.canonical_revertrisk_users_2024 AS rru
        ON mwh.event_user_id = rru.user_id
        AND mwh.wiki_db = rru.wiki_db
    WHERE mwh.snapshot = '2024-11'
        -- limit to English Wikipedia
        AND rr.wiki_db = 'enwiki'
        -- limit to revisions that were assigned a revert risk score
        AND rev_revert_risk IS NOT NULL
        -- exclude administrators; rows without group information are kept.
        -- The NULL check is on the same column as the membership test, so a
        -- NULL array cannot turn the predicate into NULL and drop the row.
        AND (
            mwh.event_user_groups_historical IS NULL
            OR NOT ARRAY_CONTAINS(mwh.event_user_groups_historical, 'sysop')
        )
        -- open interval on both ends, as in the original
        AND mwh.event_timestamp > '2024-01-01 00:00:00'
        AND mwh.event_timestamp < '2024-11-05 00:00:00'
),

-- Reverts whose author differs from the author of the edit being reverted.
-- A revert that is the first identity revert of several edits yields one row
-- per reverted edit; the consuming script removes duplicate rows.
excl_self_reverts AS (
    SELECT rd.*
    FROM revert_risks AS rd
    INNER JOIN wmf.mediawiki_history AS mwh
        ON rd.revision_id = mwh.revision_first_identity_reverting_revision_id
        AND rd.wiki_db = mwh.wiki_db
    WHERE mwh.snapshot = '2024-11'
        AND rd.is_revert
        -- exclude self reverts
        AND rd.event_user_text <> mwh.event_user_text
)

SELECT *
FROM revert_risks
WHERE NOT revert_risks.is_revert

UNION ALL

SELECT *
FROM excl_self_reverts
;
============ - enwiki - ============ - Raw data shape: (0, 17) - Duplicate rows found and removed: 0 - Clean data shape: (0, 17) - Unique revision_ids: 0 | Data Shape: 0 | Same? : -> True - Removing edits that are reverts from df | New Shape: (0, 17) - Is any revert_risk_score NA? : False - Is any user_edit_count NA? : False - Is any time_to_revert NA? 
: False Traceback (most recent call last): File "/srv/home/gkyziridis/revert_risk_threshold_for_all_wikies.py", line 392, in rc = ROC(actual=actual, probs=predicted, plot=True, name=wiki_name) File "/srv/home/gkyziridis/revert_risk_threshold_for_all_wikies.py", line 113, in ROC fpr, tpr, thresholds = roc_curve(actual, probs) File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper return func(*args, **kwargs) File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/metrics/_ranking.py", line 1150, in roc_curve fps, tps, thresholds = _binary_clf_curve( File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/metrics/_ranking.py", line 835, in _binary_clf_curve pos_label = _check_pos_label_consistency(pos_label, y_true) File "/home/gkyziridis/.conda/envs/2025-05-08T10.39.42_gkyziridis/lib/python3.10/site-packages/sklearn/utils/validation.py", line 2641, in _check_pos_label_consistency raise ValueError( ValueError: y_true takes value in {} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.