diff --git a/dbreps2/src/enwiki/featuredbysize.rs b/dbreps2/src/enwiki/featuredbysize.rs
index 1ec65f1..d2653de 100644
--- a/dbreps2/src/enwiki/featuredbysize.rs
+++ b/dbreps2/src/enwiki/featuredbysize.rs
@@ -1,108 +1,104 @@
 /*
 Copyright 2023 Kunal Mehta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see .
 */
 use anyhow::Result;
 use dbreps2::{str_vec, Frequency, Report};
 use mwbot::Bot;
 use mysql_async::prelude::*;
 use mysql_async::Conn;
 use wikipedia_prosesize::prosesize;

 pub struct Row {
     title: String,
     prose_size: u64,
     word_count: u64,
 }

 pub struct FeaturedBySize {
     pub(crate) bot: Bot,
 }

 #[async_trait::async_trait]
 impl Report for FeaturedBySize {
     fn title(&self) -> &'static str {
         "Featured articles by size"
     }

     fn frequency(&self) -> Frequency {
         Frequency::Weekly
     }

-    fn static_row_numbers(&self) -> bool {
-        true
-    }
-
     fn query(&self) -> &'static str {
         r#"
 /* featuredbysize.rs SLOW_OK */
 SELECT
   page_title
 FROM
   page
   JOIN categorylinks ON cl_from = page_id
 WHERE
   cl_to = "Featured_articles"
   AND page_namespace = 0
 "#
     }

     async fn run_query(&self, conn: &mut Conn) -> Result> {
         let pages: Vec = conn.query(self.query()).await?;
         let mut rows = vec![];
         let mut handles = vec![];
         for title in pages {
             let page = self.bot.page(&title)?;
             handles.push(tokio::spawn(async move {
                 let html = page.html().await?;
                 let size = prosesize(html);
                 Result::<_, anyhow::Error>::Ok((title, size))
             }));
         }

         for handle in handles {
             let (title, size) = handle.await??;
             println!("{title}");
             rows.push(Row {
                 title,
                 prose_size: size.prose_size(),
                 word_count: size.word_count(),
             })
         }
         rows.sort_by_key(|row| row.prose_size);
         rows.reverse();
         Ok(rows)
     }

     fn intro(&self) -> &'static str {
         "Articles in [[:Category:Featured articles]] sorted by prose size"
     }

     fn headings(&self) -> Vec<&'static str> {
         vec!["Page", "Prose size", "Word count"]
     }

     fn format_row(&self, row: &Row) -> Vec {
         str_vec![
             format!("[[{}]]", row.title.replace('_', " ")),
             row.prose_size,
             row.word_count
         ]
     }

     fn code(&self) -> &'static str {
         include_str!("featuredbysize.rs")
     }
 }
diff --git a/dbreps2/src/general/ownerlessuserpages.rs b/dbreps2/src/general/ownerlessuserpages.rs
index 6f4c20a..416d94b 100644
--- a/dbreps2/src/general/ownerlessuserpages.rs
+++ b/dbreps2/src/general/ownerlessuserpages.rs
@@ -1,183 +1,179 @@
 /*
 Copyright 2008 bjweeks, MZMcBride
 Copyright 2022 Kunal Mehta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see .
 */
 use anyhow::Result;
 use dbreps2::{linker, str_vec, Frequency, Report};
 use mysql_async::prelude::*;
 use mysql_async::Conn;

 struct FirstRow {
     page_id: u64,
     page_namespace: u32,
     page_title: String,
     page_len: u64,
 }

 struct SecondRow {
     rev_timestamp: String,
     actor_name: String,
 }

 pub struct Row {
     page_namespace: u32,
     page_title: String,
     length: u64,
     creator: String,
     creation_date: String,
 }

 async fn user_exists_globally(ca_conn: &mut Conn, name: &str) -> Result {
     let row: Option = ca_conn
         .exec_first(
             r#"
 SELECT
   1
 FROM
   globaluser
 WHERE
   gu_name = ?
 "#,
             (name,),
         )
         .await?;
     Ok(row.is_some())
 }

 async fn lookup_revision(conn: &mut Conn, row: &FirstRow) -> Result {
     Ok(conn
         .exec_map(
             r#"
 SELECT
   rev_timestamp,
   actor_name
 FROM
   page
   JOIN revision ON page_id = rev_page
   JOIN actor ON rev_actor = actor_id
 WHERE
   page_id = ?
 ORDER BY rev_timestamp ASC
 LIMIT 1;
 "#,
             (row.page_id,),
             |(rev_timestamp, actor_name)| SecondRow {
                 rev_timestamp,
                 actor_name,
             },
         )
         .await?
         .into_iter()
         .next()
         .unwrap())
 }

 pub struct Ownerlessuserpages {}

 #[async_trait::async_trait]
 impl Report for Ownerlessuserpages {
     fn title(&self) -> &'static str {
         "Ownerless pages in the user space"
     }

     fn frequency(&self) -> Frequency {
         Frequency::Daily
     }

-    fn static_row_numbers(&self) -> bool {
-        true
-    }
-
     fn query(&self) -> &'static str {
         r"
 /* ownerlessuserpages.rs SLOW_OK */
 SELECT
   page_id,
   page_namespace,
   page_title,
   page_len
 FROM
   page
   LEFT JOIN user ON user_name = REPLACE(SUBSTRING_INDEX(page_title, '/', 1), '_', ' ')
 WHERE
   page_namespace IN (2, 3)
   AND page_is_redirect = 0
   AND NOT IS_IPV4(SUBSTRING_INDEX(page_title, '/', 1))
   AND NOT IS_IPV6(SUBSTRING_INDEX(page_title, '/', 1))
   AND page_title NOT RLIKE '(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)'
   AND ISNULL(user_name);
 "
     }

     async fn run_query(&self, conn: &mut Conn) -> Result> {
         let rows = conn
             .query_map(
                 self.query(),
                 |(page_id, page_namespace, page_title, page_len)| FirstRow {
                     page_id,
                     page_namespace,
                     page_title,
                     page_len,
                 },
             )
             .await?;
         let ca_pool = self.centralauth()?;
         let mut ca_conn = ca_pool.get_conn().await?;
         let mut last = vec![];
         for row in rows {
             let username = row.page_title.replace('_', " ");
             let username = if username.contains('/') {
                 let (username, _) = username.split_once('/').unwrap();
                 username.to_string()
             } else {
                 username
             };
             if user_exists_globally(&mut ca_conn, &username).await? {
                 continue;
             }
             let rev = lookup_revision(conn, &row).await?;
             last.push(Row {
                 page_namespace: row.page_namespace,
                 page_title: row.page_title,
                 length: row.page_len,
                 creator: rev.actor_name,
                 creation_date: rev.rev_timestamp,
             })
         }
         Ok(last)
     }

     fn intro(&self) -> &'static str {
         "Pages in the user space that do not belong to a [[Special:ListUsers|registered user]]"
     }

     fn headings(&self) -> Vec<&'static str> {
         vec!["Page", "Length", "Creator", "Creation date"]
     }

     fn format_row(&self, row: &Row) -> Vec {
         str_vec![
             linker(row.page_namespace, &row.page_title),
             row.length,
             row.creator,
             row.creation_date
         ]
     }

     fn code(&self) -> &'static str {
         include_str!("ownerlessuserpages.rs")
     }
 }
diff --git a/dbreps2/src/lib.rs b/dbreps2/src/lib.rs
index fde7bd3..48bc7ee 100644
--- a/dbreps2/src/lib.rs
+++ b/dbreps2/src/lib.rs
@@ -1,517 +1,518 @@
 /*
 Copyright 2008 bjweeks, MZMcBride
 Copyright 2021 Kunal Mehta

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see .
 */
 use anyhow::Result;
 use log::{error, info};
 use mwbot::{Bot, Page, SaveOptions};
 use mysql_async::{Conn, Pool};
 use regex::Regex;
 use std::fmt::{Display, Formatter};
 use time::format_description::FormatItem;
 use time::macros::format_description;
 use time::{Duration, OffsetDateTime, PrimitiveDateTime};
 use tokio::fs;

 mod config;

 #[macro_export]
 macro_rules! str_vec {
     ( $( $item:expr ),* ) => {
         {
             let mut temp_vec = Vec::new();
             $(
                 temp_vec.push($item.to_string());
             )*
             temp_vec
         }
     };
 }

 const SIG_TIMESTAMP: &[FormatItem] = format_description!(
     "[hour]:[minute], [day padding:none] [month repr:long] [year] (UTC)"
 );
 const DB_TIMESTAMP: &[FormatItem] =
     format_description!("[year][month][day][hour][minute][second]");
 const Y_M_D_TIMESTAMP: &[FormatItem] =
     format_description!("[year]-[month]-[day]");

 const INDEX_WIKITEXT: &str = r#"{{DBR index}}
 {{DBR footer}}
 "#;

 const BLANK_WIKITEXT: &str = r#"{{intentionally blank}}"#;

 pub async fn load_config() -> Result {
     let path = dirs::home_dir().unwrap().join(".dbreps.toml");
     let contents = fs::read_to_string(path).await?;
     Ok(toml::from_str(&contents)?)
 }

 async fn save_page(page: Page, text: String) -> Result<()> {
     info!("Updating [[{}]]", page.title());
     info!("{}", &text);
     page.save(text, &SaveOptions::summary("Bot: updating database report"))
         .await?;
     Ok(())
 }

 pub enum Frequency {
     Daily,
     /// Daily, but at the specific hour too
     DailyAt(u8),
     Weekly,
     Fortnightly,
     Monthly,
 }

 impl Frequency {
     fn to_duration(&self) -> Duration {
         match &self {
             Frequency::Daily | Frequency::DailyAt(_) => Duration::days(1),
             Frequency::Weekly => Duration::weeks(1),
             Frequency::Fortnightly => Duration::weeks(2),
             Frequency::Monthly => Duration::weeks(4),
         }
     }

     fn at_hour(&self) -> Option {
         if let Frequency::DailyAt(hour) = &self {
             Some(*hour)
         } else {
             None
         }
     }
 }

 impl Display for Frequency {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         match self.to_duration().whole_days() {
             // every day
             1 => write!(f, "This report is updated every day")?,
             // every X days
             num => write!(f, "This report is updated every {num} days")?,
         };
         match self.at_hour() {
             Some(hour) => write!(f, " at {hour}:00 UTC."),
             None => write!(f, "."),
         }
     }
 }

 #[async_trait::async_trait]
 pub trait Report {
     // TODO: Make this per-wiki/language
     fn title(&self) -> &'static str;

     fn get_title(&self) -> String {
         format!("Project:Database reports/{}", self.title())
     }

     fn frequency(&self) -> Frequency;

     fn rows_per_page(&self) -> Option {
         None
     }

     fn static_row_numbers(&self) -> bool {
-        false
+        // if this is a single page report (no rows per page) we use static row numbers
+        self.rows_per_page().is_none()
     }

     fn enumerate(&self) -> bool {
         !self.static_row_numbers()
     }

     fn query(&self) -> &'static str;

     async fn run_query(&self, conn: &mut Conn) -> Result>;

     fn intro(&self) -> &'static str {
         self.title()
     }

     fn headings(&self) -> Vec<&'static str>;

     fn format_row(&self, row: &T) -> Vec;

     fn code(&self) -> &'static str;

     fn get_intro(&self, _index: usize) -> String {
         // TODO: is replag something we still need to care about? meh
         let mut intro = vec![format!(
             "{}; data as of ~~~~~. {}\n",
             self.intro(),
             self.frequency()
         )];
         if self.static_row_numbers() {
             intro.push("{{static row numbers}}".to_string());
         }
         let mut classes = vec!["wikitable", "sortable"];
         if self.static_row_numbers() {
             classes.extend(["static-row-numbers", "static-row-header-text"]);
         }
         intro.push(format!(
             r#"{{| class="{}"
 |- style="white-space: nowrap;""#,
             classes.join(" ")
         ));
         if self.enumerate() {
             intro.push("! No.".to_string());
         }
         for heading in self.headings() {
             intro.push(format!("! {heading}"));
         }
         intro.join("\n")
     }

     fn get_footer(&self) -> String {
         "|-\n|}\n{{DBR footer}}\n".to_string()
     }

     fn needs_update(&self, old_text: &str) -> Result {
         if let Some(hour) = self.frequency().at_hour() {
             // If we are supposed to run at a specific time
             // and it is that time, then run!
             if OffsetDateTime::now_utc().hour() == hour {
                 return Ok(true);
             }
         }
         let re = Regex::new("(.*?)").unwrap();
         let ts = match re.captures(old_text) {
             Some(cap) => cap[1].to_string(),
             None => {
                 // No match, it needs an update!
                 return Ok(true);
             }
         };
         let dt = PrimitiveDateTime::parse(&ts, &SIG_TIMESTAMP)?.assume_utc();
         let now = OffsetDateTime::now_utc();
         let skew = Duration::minutes(20);
         if (dt + self.frequency().to_duration() - skew) < now {
             Ok(true)
         } else {
             Ok(false)
         }
     }

     fn build_page(&self, rows: &[T], index: usize) -> String {
         // The first row starts at the # of previous pages times rows per page
         let mut row_num = (index - 1) * self.rows_per_page().unwrap_or(0);
         let mut text = vec![self.get_intro(index)];
         for row in rows {
             row_num += 1;
             text.push("|-".to_string());
             if self.enumerate() {
                 text.push(format!("| {row_num}"));
             }
             for item in self.format_row(row) {
                 text.push(format!("| {item}"));
             }
         }
         text.push(self.get_footer());
         text.join("\n")
     }

     async fn really_run(&self, runner: &Runner) {
         let should_run = match &runner.report {
             Some(wanted) => wanted == self.title(),
             None => true,
         };
         let debug_mode = runner.report.is_some();
         if should_run {
             match self.run(debug_mode, &runner.bot, &runner.pool).await {
                 Ok(_) => {}
                 Err(err) => {
                     error!("{}", err.to_string());
                 }
             }
         }
     }

     async fn post_run(&self, _bot: &Bot, _debug_mode: bool) -> Result<()> {
         Ok(())
     }

     fn subpage(&self, index: usize) -> String {
         format!("{}/{}", self.get_title(), index)
     }

     fn update_index(&self) -> bool {
         true
     }

     fn centralauth(&self) -> Result {
         info!("Setting up MySQL connection pool for centralauth...");
         Ok(Pool::new(
             toolforge::connection_info!("centralauth", ANALYTICS)?
                 .to_string()
                 .as_str(),
         ))
     }

     fn title_for_update_check(&self) -> String {
         match self.rows_per_page() {
             Some(_) => self.subpage(1),
             None => self.get_title(),
         }
     }

     async fn run(
         &self,
         debug_mode: bool,
         bot: &Bot,
         pool: &Pool,
     ) -> Result<()> {
         // Bypass needs update check when --report is passed
         if debug_mode {
             info!("Passed --report, we're in debug mode");
         } else {
             info!(
                 "{}: Checking when last results were published...",
                 self.get_title()
             );
             let title_for_update_check = self.title_for_update_check();
             let page = bot.page(&title_for_update_check)?;
             if page.exists().await? {
                 let old_text = page.wikitext().await?;
                 if !self.needs_update(&old_text)? {
                     info!(
                         "{}: Report is still up to date, skipping update.",
                         self.get_title()
                     );
                     return Ok(());
                 }
             }
         }
         let mut conn = pool.get_conn().await?;
         info!("{}: Starting query...", self.get_title());
         let rows = self.run_query(&mut conn).await?;
         info!(
             "{}: Query finished, found {} rows",
             self.get_title(),
             &rows.len()
         );
         match self.rows_per_page() {
             Some(rows_per_page) => {
                 let iter = rows.chunks(rows_per_page);
                 let mut index = 0;
                 for chunk in iter {
                     index += 1;
                     let text = self.build_page(chunk, index);
                     if debug_mode {
                         info!("{}", &text);
                     } else {
                         let page = bot.page(&self.subpage(index))?;
                         save_page(page, text).await?;
                     }
                 }
                 // Now "Blank" any other subpages
                 loop {
                     index += 1;
                     let page = bot.page(&self.subpage(index))?;
                     if !page.exists().await? {
                         break;
                     }
                     if debug_mode {
                         info!("{}", BLANK_WIKITEXT);
                     } else {
                         save_page(page, BLANK_WIKITEXT.to_string()).await?;
                     }
                 }
                 // Finally make sure the index page is up to date
                 if self.update_index() {
                     if debug_mode {
                         info!("{}", INDEX_WIKITEXT);
                     } else {
                         save_page(
                             bot.page(&self.get_title())?,
                             INDEX_WIKITEXT.to_string(),
                         )
                         .await?;
                     }
                 }
             }
             None => {
                 // Just dump it all into one page
                 let text = self.build_page(&rows, 1);
                 if debug_mode {
                     info!("{}", &text);
                 } else {
                     save_page(bot.page(&self.get_title())?, text).await?;
                 }
             }
         }

         // Finally, publish the /Configuration subpage
         let config = format!(
             r#"{}
 == Source code ==
 {}
 "#,
             self.frequency(),
             self.code()
         );
         if debug_mode {
             info!("{}", &config);
         } else {
             save_page(
                 bot.page(&format!("{}/Configuration", self.get_title()))?,
                 config,
             )
             .await?;
         }
         self.post_run(bot, debug_mode).await?;
         Ok(())
     }
 }

 pub struct Runner {
     pub bot: Bot,
     pub pool: Pool,
     /// Requested report with --report
     report: Option,
 }

 impl Runner {
     pub async fn new(
         domain: &str,
         dbname: &str,
         report: Option,
     ) -> Result {
         let cfg = load_config().await?;
         let bot = Bot::builder(
             format!("https://{domain}/w/api.php"),
             format!("https://{domain}/api/rest_v1"),
         )
         .set_oauth2_token(cfg.auth.username, cfg.auth.oauth2_token)
         .build()
         .await?;
         info!("Setting up MySQL connection pool for {}...", dbname);
         let pool = Pool::new(
             toolforge::connection_info!(dbname, ANALYTICS)?
                 .to_string()
                 .as_str(),
         );
         Ok(Self { bot, pool, report })
     }
 }

 pub fn dbr_link(target: &str) -> String {
     format!("{{{{dbr link|1={}}}}}", target.replace('_', " "))
 }

 pub fn linker(ns: u32, target: &str) -> String {
     let colon = match ns {
         // File | Category
         6 | 14 => ":",
         _ => "",
     };
     let ns_prefix = match ns {
         0 => "".to_string(),
         num => format!("{{{{subst:ns:{num}}}}}:"),
     };
     format!("[[{colon}{ns_prefix}{target}]]")
 }

 /// "Escape" a block reason so it's safe for display
 /// in a table context
 pub fn escape_reason(text: &str) -> String {
     text
         // Escape templates
         .replace("{{", "{{tl|")
         // And HTML comments
         .replace(""),
             "{{tl|foo}} [[bar]] ".to_string()
         )
     }

     #[test]
     fn test_timestamp() {
         let ts = "14:31, 3 January 2022 (UTC)";
         let dt = PrimitiveDateTime::parse(ts, &SIG_TIMESTAMP)
             .unwrap()
             .assume_utc();
         assert_eq!(dt.date(), date!(2022 - 01 - 03));
         assert_eq!(dt.time(), time!(14:31:00));
     }

     #[test]
     fn test_y_m_d() {
         assert_eq!(y_m_d("20010115192713"), "2001-01-15".to_string());
         assert_eq!(y_m_d("20221015001541"), "2022-10-15".to_string());
     }

     #[test]
     fn test_frequency() {
         assert_eq!(
             &Frequency::Daily.to_string(),
             "This report is updated every day."
         );
         assert_eq!(
             &Frequency::Weekly.to_string(),
             "This report is updated every 7 days."
         );
         assert_eq!(
             &Frequency::DailyAt(3).to_string(),
             "This report is updated every day at 3:00 UTC."
         );
     }
 }