#!/usr/bin/php
<?php
/**
 * WMCH user report 2020 generator.
 *
 * For each configured wiki it exports the users matching the WMCH 2020
 * membership criteria into a per-wiki CSV file, resuming from the last
 * exported user ID when the file already exists.
 *
 * NOTE(review): the head of this file was garbled in the copy under review.
 * The framework bootstrap (providing query_row(), esc_sql(), the Query and
 * DB classes) and the definitions of REGISTRATION_DATE_BEFORE,
 * MIN_EDITS_RANGE_AFTER, MIN_EDITS_RANGE_BEFORE, MIN_EDITS_RANGE,
 * USERS_BATCH and the wiki list were reconstructed from context below —
 * confirm against the repository before merging.
 */

// process every wiki: a failure on one wiki must not stop the others
// NOTE(review): reconstructed from the surviving "getMessage() ); } } }"
// fragment — verify the original wiki-list variable/constant name.
foreach( $WIKIS as $wiki_codename => $wiki_database ) {
	try {
		generate_wmch_report_for_wiki( $wiki_codename, $wiki_database );
	} catch( Exception $e ) {
		// log and continue with the next wiki
		message( $e->getMessage() );
	}
}

/**
 * Generate the WMCH user report 2020 for a specific wiki
 *
 * Appends the matching users to "<wiki_codename>.csv". If the file already
 * exists the export resumes after the last user ID found in its last line.
 *
 * @param string $wiki_codename Wiki codename like 'itwiki'
 * @param string $wiki_database Wiki database name like 'itwiki_p'
 * @throws Exception When the database is missing or the CSV file cannot be opened or locked
 */
function generate_wmch_report_for_wiki( $wiki_codename, $wiki_database ) {

	// check if the database exists
	$wiki_exists = query_row( sprintf(
		"SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '%s'",
		esc_sql( $wiki_database )
	) );

	// no wiki no party
	if( !$wiki_exists ) {
		throw new Exception( "missing database $wiki_database" );
	}

	// name of the exported filename
	$filename_export = sprintf( '%s.csv', $wiki_codename );

	// check if the file already exists
	$file_exists = file_exists( $filename_export );

	// last processed user id
	$last_user_id = 0;

	// do not process twice a file
	if( $file_exists ) {
		message( "continuing already existing file $filename_export" );
		$last_user_id = last_user_id_from_filename( $filename_export );
	}

	// fields to be exported from the query and put in the CSV file
	$EXPORT_FIELDS = [
		'user_id',
		'user_name',
		'user_editcount',
		//
		// these fields are not populated in replicas
		// 'user_registration',
		// 'user_touched',
	];

	// users must have more than 5 years of wiki life
	// (so they must have registered at least before 15 January 2016)
	$max_user_registration_timestamp = date_to_mysql_timestamp( REGISTRATION_DATE_BEFORE );

	// users must have at least 50 undifferentiated edits on at least one single
	// project between 15 January 2020 and 15 January 2021 (so e.g. 20 edits on
	// Wikipedia plus 30 on Wikisource is not enough: at least 50 must be on one
	// single project, e.g. Wikipedia).
	$min_edits_range_timestamp_after  = date_to_mysql_timestamp( MIN_EDITS_RANGE_AFTER );
	$min_edits_range_timestamp_before = date_to_mysql_timestamp( MIN_EDITS_RANGE_BEFORE );

	// must have at least one contribution before a specific date
	$subquery_old_contributions = ( new Query() )
		->from( 'revision_userindex' )
		->equals( 'rev_actor', 'actor_id' )
		->whereStr( 'rev_timestamp', $max_user_registration_timestamp, '<=' );

	// search in an efficient way if the contributor has at least some edits in a period
	// it selects "1" if the "LIMIT 50, 1" returns something
	$subquery_contributions_in_period = ( new Query() )
		->from( 'revision_userindex' )
		->equals( 'rev_actor', 'actor_id' )
		->whereStr( 'rev_timestamp', $min_edits_range_timestamp_after, '>=' )
		->whereStr( 'rev_timestamp', $min_edits_range_timestamp_before, '<=' )
		// Efficient "minimum number of rows" statement:
		// We want to discover if this query has at least MIN_EDITS_RANGE rows
		// but note that the COUNT(*) function means a full table scan. It's expensive. Don't do that.
		// Why? Because COUNTing *all* his/her/its contributions just to discover if they are greater than "$min_edits" is inefficient.
		// Instead, we can use the LIMIT clause to skip the "MIN_EDITS_RANGE - 1" rows and return just the next following row if it exists;
		// so we will return zero or one row depending if this user has at least MIN_EDITS_RANGE contributions or not,
		// and this is freacking efficient because the LIMIT clause uses indexes.
		// -- boz
		->limit( MIN_EDITS_RANGE - 1, 1 );

	// export in a CSV file
	$out = fopen( $filename_export, 'a' );
	if( $out === false ) {
		throw new Exception( "cannot open file $filename_export" );
	}

	// lock the file in exclusive write mode
	// LOCK_EX: exclusive flag
	// LOCK_NB: make the call non-blocking
	if( !flock( $out, LOCK_EX | LOCK_NB ) ) {
		// do not leak the file handle on failure
		fclose( $out );
		throw new Exception( "cannot lock file $filename_export" );
	}

	// put an heading in the file with the exported field names
	if( !$file_exists ) {
		fputcsv( $out, $EXPORT_FIELDS );
	}

	// process a reduced set of users at a time
	do {

		// the final query to the user table with all the conditions
		$query = ( new Query() )
			->select( $EXPORT_FIELDS )
			->from( 'user' )
			->joinOn( 'INNER', 'actor', 'actor_user', 'user_id' )
			//
			// this does not work because the user_registration field is NULL in Wikimedia Foundation replicas
			// ->whereInt( 'user_registration', $max_user_registration_timestamp, '<=' )
			//
			// continue the last bulk
			// this is zero if it's the first time, and in that case it does not filter anything
			->whereInt( 'user_id', $last_user_id, '>' )
			// must have at least one old contribution
			->whereExists( $subquery_old_contributions )
			// must have some number of contributions in the specified period
			->whereExists( $subquery_contributions_in_period )
			// sort by user ID
			->orderBy( 'user_id' )
			// process this number of users
			->limit( USERS_BATCH );

		// select the desired database
		DB::instance()->selectDB( $wiki_database );

		// show the current query
		message( "Executing query on $wiki_database after user $last_user_id" );

		// query the results
		$users = $query->queryGenerator();

		// last processed user from the next loop
		$user = null;

		// loop the users
		foreach( $users as $user ) {

			// export the user fields in the right order
			$data = [];
			foreach( $EXPORT_FIELDS as $field ) {
				$data[] = $user->{ $field };
			}

			// write a line
			fputcsv( $out, $data );

		} // end loop users

		// on the last processed user remember its ID
		if( $user ) {
			$last_user_id = $user->user_id;
		}

	} while( $user ); // end query

	// release the lock and close the CSV file
	flock( $out, LOCK_UN );
	fclose( $out );
}

/**
 * Print a message on standard output prefixed with the current date and time
 *
 * @param string $message
 */
function message( $message ) {
	printf( "[%s] %s\n", date( 'Y-m-d H:i:s' ), $message );
}

/**
 * Read the last non-empty line of a file
 *
 * Walks backward from the end of the file collecting characters until a
 * newline is found; a trailing newline (empty last line) is skipped.
 *
 * See https://stackoverflow.com/a/1510204/3451846
 *
 * Fixes over the previous version: the original "fseek( $fp, $pos-- )"
 * re-read the character just consumed (duplicating the last character of
 * files without a trailing newline) and never collected the character at
 * position zero; empty or unreadable files now return '' instead of warning.
 *
 * @param string $file File pathname
 * @return string The last line, or an empty string when the file is empty or unreadable
 */
function last_file_line( $file ) {
	$last_line = '';
	$fp = fopen( $file, 'r' );
	if( $fp === false ) {
		return $last_line;
	}

	// position of the last character (fseek fails on an empty file)
	if( fseek( $fp, -1, SEEK_END ) === -1 ) {
		fclose( $fp );
		return $last_line;
	}
	$pos = ftell( $fp );

	// loop backward until "\n" is found
	// but if the last line is empty, skip it
	while( $pos >= 0 ) {
		fseek( $fp, $pos );
		$c = fgetc( $fp );
		if( $c === "\n" ) {
			// stop at the newline that terminates the previous line,
			// but skip the trailing newline of the file itself
			if( $last_line !== '' ) {
				break;
			}
		} else {
			$last_line = $c . $last_line;
		}
		$pos--;
	}

	fclose( $fp );
	return $last_line;
}

/**
 * Try to extract the last user ID from a CSV of User IDs in the first column
 *
 * It returns zero (falsy) if the user id has no sense.
 *
 * @param string $filename
 * @return int|false The last user ID, zero when the last line is not numeric, false when the file has no last line
 */
function last_user_id_from_filename( $filename ) {
	$user_id = false;

	// try to read the last user ID from the first CSV column of the last line
	$last_line = last_file_line( $filename );
	if( $last_line ) {
		$user_id = (int) str_getcsv( $last_line )[0];
	}

	return $user_id;
}

/**
 * Convert a textual date to a MySQL binary timestamp like '20160115000000'
 *
 * @param string $date Anything parsable by strtotime(), e.g. '2016-01-15'
 * @return string Timestamp in 'YmdHis' format
 * @throws Exception When the date cannot be parsed (instead of silently
 *                   producing the epoch timestamp as the previous version did)
 */
function date_to_mysql_timestamp( $date ) {
	$time = strtotime( $date );
	if( $time === false ) {
		throw new Exception( "invalid date $date" );
	}
	return date( 'YmdHis', $time );
}