// MediaWiki:Gadget-Fill Index.js

/*
 * Author: w:fr:Phe
 *
 * Import the contents of the "Book" template from Commons into the Index
 * page fields at Wikisource
 *
 * Modified: 2020-11-10:   More robust template handling to deal with Faebot
 *                         uploads (Inductiveload)
 *           2020-11-27:   Some simple heuristics to improve IA metadata
 *           2021-04-03:   Supports authors set with Wikidata IDs (Qnnn)
 */

/* eslint-disable camelcase, one-var, vars-on-top */

( function ( mw, $, Promise ) {	'use strict';

// var FillIndex = { // };

/**
 * Parse the first occurrence of a template out of a piece of wikitext.
 *
 * Nested templates and wikilinks inside a parameter value are kept verbatim
 * as part of that value; only `|` and the first `=` at the top level of the
 * template act as separators.
 *
 * @param {string} text     the wikitext to search
 * @param {string} template the template name (inserted into a case-insensitive
 *                          RegExp as-is, so it must not contain regex syntax)
 * @return {Array} a two-element array:
 *   - `[ null, null ]` when the template does not occur in `text`
 *   - `[ null, raw ]` when the template starts but is never closed
 *   - `[ params, raw ]` otherwise, where `raw` is the template's wikitext and
 *     `params` maps parameter names (first letter upper-cased) to trimmed
 *     values. Unnamed parameters are stored under 0, 1, 2…; key 0 is the
 *     template name itself.
 */
function parse_template( text, template ) {
	// find the start of the template in the wikitext
	var re = new RegExp( '{{ *' + template + '[ \n]*\\|', 'i' ),
		start = text.search( re );

	// The template is not present
	if ( start < 0 ) {
		return [ null, null ];
	}

	var token_list = [
			[ '{{', '}}' ],
			// Built by concatenation so that the wiki parser never sees link
			// markup in this page.
			// NOTE(review): this pair was stripped from the copy under review
			// and is reconstructed - confirm against the deployed gadget.
			[ '[' + '[', ']' + ']' ]
		],
		// stack of the opening tokens we are currently inside
		tokens = [],
		params = {},
		pos_param_idx = 0,
		param_name = '',
		param_content = '',
		found_equals = false,
		index = start;

	// Store the parameter collected so far and reset the collectors.
	function save_param() {
		var name = param_name.trim(),
			value = param_content.trim();

		if ( name.length === 0 ) {
			// positional parameter (pos=0 is the template name)
			params[ pos_param_idx ] = value;
			pos_param_idx += 1;
		} else {
			params[ name[ 0 ].toUpperCase() + name.slice( 1 ) ] = value;
		}

		param_name = '';
		param_content = '';
		found_equals = false;
	}

	while ( index < text.length ) {
		var top = tokens[ tokens.length - 1 ],
			matched = null,
			is_closing = false;

		for ( var i = 0; i < token_list.length; i += 1 ) {
			var cand_token = text.slice( index, index + token_list[ i ][ 0 ].length );

			if ( cand_token === token_list[ i ][ 0 ] ) {
				matched = cand_token;
				break;
			}
			// a closing token only counts when it closes the innermost opener
			if ( cand_token === token_list[ i ][ 1 ] && top === token_list[ i ][ 0 ] ) {
				matched = cand_token;
				is_closing = true;
				break;
			}
		}

		if ( matched !== null ) {
			// tokens do their own lengths
			index += matched.length;

			if ( is_closing ) {
				tokens.pop();
				if ( tokens.length === 0 ) {
					// end of template: the last parameter has no trailing '|'
					save_param();
					break;
				}
				param_content += matched;
			} else {
				tokens.push( matched );
				// the outermost '{{' is not part of any parameter
				if ( tokens.length > 1 ) {
					param_content += matched;
				}
			}
			continue;
		}

		var ch = text[ index ];

		if ( tokens.length === 1 && ch === '|' ) {
			// end of a template parameter, save it
			save_param();
		} else if ( tokens.length === 1 && ch === '=' && !found_equals ) {
			// only the first top-level '=' separates name from value
			found_equals = true;
			param_name = param_content;
			param_content = '';
		} else {
			param_content += ch;
		}
		index += 1;
	}

	if ( tokens.length === 0 ) {
		// got to end of template
		return [ params, text.slice( start, index ) ];
	}
	return [ null, text.slice( start, index ) ];
}

/*
 * Converts text to title case.
 *
 * BOOK IV. THE INSTRUCTIONS OF I -> Book IV. The Instructions of I
 *
 * The input is lower-cased and split on single spaces, then each word is:
 *  - upper-cased entirely if it is a roman numeral followed by one
 *    punctuation character (e.g. "iv." -> "IV.")
 *  - title-cased if it starts a sentence (first word, or the previous word
 *    ended in a full stop)
 *  - left in lower case if it is in a list of exceptions like 'a', 'of'
 *  - title-cased otherwise
 *
 * @param {string} str the text to convert
 * @return {string} the title-cased text
 */
var toTitleCase = function ( str ) {

	// upper-case the first character of a word
	var titler = function ( word ) {
		if ( word.length === 0 ) {
			return word;
		}
		return word[ 0 ].toUpperCase() + word.slice( 1 );
	};

	// check for roman numerals followed by punct
	// NOTE(review): the pattern requires exactly one trailing character, so
	// a bare numeral such as "iv" is only title-cased - confirm whether an
	// optional match (`.?`) was intended.
	var all_capped = function ( word ) {
		return ( word.search( /^[ivxlcdm]+\b.$/ ) > -1 );
	};

	// in bookish title case, not all words are capped
	var no_cap_words = [ 'a', 'an', 'be', 'the', 'of', 'on', 'to', 'at', 'this',
		'than', 'then', 'by', 'and', 'for', 'with', 'in' ];

	var words = str.toLowerCase().split( ' ' );
	var titled = [];
	var new_sentence = true;

	for ( var i = 0; i < words.length; i++ ) {
		if ( all_capped( words[ i ] ) ) {
			// some words are all caps always
			titled.push( words[ i ].toUpperCase() );
		} else if ( new_sentence || no_cap_words.indexOf( words[ i ] ) === -1 ) {
			// new sentences and most words get title casing
			titled.push( titler( words[ i ] ) );
		} else {
			// lower
			titled.push( words[ i ] );
		}

		// a trailing full stop means the next word starts a sentence
		new_sentence = words[ i ].search( /\.$/ ) !== -1;
	}

	return titled.join( ' ' );
};

// Working copies of the localisation tables; filled in by
// setup_extract_dict() from self.fill_index_data.
var extract_dict = {},
	field_names = {};

/**
 * Copy the extraction dictionary and field-name table from
 * self.fill_index_data into the module-level variables.
 */
function setup_extract_dict() {
	extract_dict = self.fill_index_data.extract_dict;
	field_names = self.fill_index_data.field_names;
}

/**
 * Set the appropriate input field
 *
 * The value is written to the first element named
 * 'wpprpindex-' + field_names[ idx ]; nothing happens if no such element
 * exists. A failure to resolve or process the content is logged to the
 * console.
 *
 * @param {string} idx    the field index
 * @param {string|Promise} content the new content, or a Promise that resolves it
 */
function set_field( idx, content ) {

	// this resolves with either the raw value, or the resolution of the Promise
	// eslint-disable-next-line compat/compat
	Promise.resolve( content ).then( function ( content_value ) {
		// tidy the spacing around the first ';', ':' or ',' that follows a space
		content_value = content_value.replace( / ([;:,]) ?/, '$1 ' );

		// fix any sneaky double spaces
		content_value = content_value.replace( / +/g, ' ' );

		var field_name = field_names[ idx ],
			f = document.getElementsByName( 'wpprpindex-' + field_name )[ 0 ];

		if ( f ) {
			f.value = content_value;
		}
	} ).catch( function ( error ) {
		// e.g. a failed author lookup: report it rather than leaving an
		// unhandled rejection
		console.error( 'Could not set field ' + idx, error );
	} );
}

/**
 * Look up the title of this wiki's page for a Wikidata item.
 *
 * Queries the Wikidata API (wbgetentities, sitelinks only) over JSONP and
 * reads the sitelink keyed by this wiki's wgWikiID.
 *
 * @param {string} qid the Wikidata item ID to look up
 * @return {Promise} resolves with the sitelink title; rejects if the request
 *   fails or the item has no sitelink for this wiki
 */
function get_wd_author( qid ) {

	// eslint-disable-next-line compat/compat
	return new Promise( function ( resolve, reject ) {
		$.ajax( {
			url: '//wikidata.org/w/api.php',
			data: {
				format: 'json',
				action: 'wbgetentities',
				ids: qid,
				props: 'sitelinks'
			},
			dataType: 'jsonp',
			cache: true,
			success: function ( data ) {
				var entity = data && data.entities && data.entities[ qid ],
					sitelinks = entity && entity.sitelinks,
					sitelink = sitelinks && sitelinks[ mw.config.get( 'wgWikiID' ) ];

				// Without this guard a missing entity or sitelink throws inside
				// the callback and the promise never settles.
				if ( !sitelink ) {
					reject( new Error( 'No sitelink on this wiki for ' + qid ) );
					return;
				}

				var author = sitelink.title;
				console.log( author );
				resolve( author );
			},
			error: function ( error ) {
				reject( error );
			}
		} );
	} );
}

/**
 * Turn one author entry into a link to the author's page.
 *
 * An entry containing a Wikidata ID (Q followed by digits) is resolved
 * through get_wd_author(). Any other entry is cleaned up with a few
 * heuristics: dates and honorifics are stripped, "Last, First" is reordered
 * and bare initials get a full stop.
 *
 * @param {string} str a single author entry
 * @return {Promise} resolves with a pipe-trick wikilink into the namespace
 *   named by self.fill_index_data.ns_author_name
 */
function process_author( str ) {
	// drop a leading list marker
	str = str.replace( /^[*:][ ]*/, '' );
	str = str.trim();

	var author_promise;

	if ( str.match( /Q[0-9]+/ ) ) {

		author_promise = get_wd_author( str );
	} else {

		// strip dates - these are nearly always not needed
		str = str.replace( /(?:, )?(?:(?:ca\.|fl\.) )?(\(?\d+-\d+\)?).?$/, '' );

		// strip birth date
		str = str.replace( /(?:, )(?:b\.|d\.) +\d{3,4}$/, '' );

		// strip initial expansions
		str = str.replace( /(?:[A-Z]. ?)+ \((.*)\)/, '$1' );

		str = str.replace( /, (Sir|Lord)$/, '' );

		// Last, First -> First Last
		str = str.replace( /^([^,]+), ([^,]+)$/, '$2 $1' );

		// Fix initials without dots
		str = str.replace( / ([A-Z]) /g, ' $1. ' );

		// Fix bogus fullstops
		// str = str.replace(/(?<!Jr|Sr)\.$/, "");

		// just resolve right now
		author_promise = Promise.resolve( str );
	}

	return author_promise.then( function ( author ) {
		// prevent the pipe trick triggering on the JS
		// eslint-disable-next-line no-useless-concat
		return '[' + '[' + self.fill_index_data.ns_author_name + ':' + author + '|]]';
	} );
}

/**
 * Turn the content of an author-type template parameter into a
 * comma-separated list of author links.
 *
 * The text is split on newlines, blank entries are dropped and each
 * remaining entry goes through process_author().
 *
 * @param {string} str the raw parameter content
 * @return {Promise} resolves with the processed authors joined by ', '
 */
function process_authors( str ) {

	// NOTE(review): both patterns below were stripped from the copy of this
	// file under review (only `//g` and the '$1\n' replacement survived).
	// They are reconstructed from the surrounding comments and from what
	// process_author() accepts - confirm against the deployed gadget.

	// strip creator templates: "Creator:Name" inside double braces -> "Name"
	str = str.replace( /\{\{ *creator *: *(.*?) *\}\}/gi, '$1\n' );

	// TODO: fix wikidata here
	// "Creator|Wikidata=Qnnn" inside double braces -> "Qnnn"
	str = str.replace( /\{\{ *creator *\| *wikidata *= *(Q[0-9]+)[^}]*\}\}/gi, '$1\n' );

	var as = str.split( '\n' );

	// drop entries that are empty or only whitespace
	as = as.filter( function ( s ) {
		return !!s.trim();
	} );

	// map array to promises
	var promises = as.map( function ( author ) {
		return process_author( author );
	} );

	// eslint-disable-next-line compat/compat
	return Promise.all( promises ).then( function ( results ) {
		var list = results.join( ', ' );
		// console.log( list );
		return list;
	} );
}

/**
 * Split a combined "city / publisher" string into its two parts.
 *
 * - If the string contains a colon, everything before the first colon is the
 *   city and everything after it is the publisher.
 * - Otherwise the string is split on ", ", "; " or ": " and the first part
 *   is taken as the city only if it matches one of a list of well-known
 *   publishing cities.
 * - Failing both, the whole string is the publisher and the city is empty.
 *
 * @param {string} str the publisher field content
 * @return {string[]} `[ publisher, city ]`, both trimmed
 */
function split_city_publisher( str ) {

	// most books are published in a few cities
	var cities = [
			/London/, /Edinburgh/, /Oxford/, /Cambridge/,
			/New York/, /Boston/, /Philadelphia/, /Washington D. ?C./,
			/Paris/, /Berlin/, /Stuttgart/, /Jena/,
			/Hong Kong/, /Shanghai/, /Calcutta/, /Bombay/, /Delhi/
		],
		city = '',
		publisher = str,
		parts;

	if ( str.indexOf( ':' ) > -1 ) {
		// a colon: assume this is a city: publisher
		parts = str.split( ':' );
		city = parts[ 0 ];
		// re-join so colons inside the publisher name survive
		publisher = parts.slice( 1 ).join( ':' );
	} else {

		parts = str.split( /[,;:] / );

		if ( parts.length > 1 ) {
			for ( var i = 0; i < cities.length; i++ ) {
				if ( parts[ 0 ].match( cities[ i ] ) ) {
					city = parts[ 0 ];
					publisher = parts.slice( 1 ).join( ', ' );
					break;
				}
			}
		}
	}

	return [ publisher.trim(), city.trim() ];
}

/**
 * Split a volume field into the volume label and an optional description.
 *
 * A trailing "(Foo)" or ": Foo" is taken off as the description. If what is
 * left is just a number (optionally prefixed with "vol." or "v."), it is
 * rewritten as "Volume N".
 *
 * @param {string} v the volume field content
 * @return {string[]} `[ volume, description ]`; the description is '' when
 *   there is none
 */
function processVolume( v ) {

	// first, strip off either : Foo or (Foo):
	var match = v.match( /(.*?) *(?:\((.*)\)|: *(.*))?$/ );

	var vol = v;
	var v_desc = '';
	if ( match ) {
		vol = match[ 1 ];
		// group 2 holds the "(Foo)" form, group 3 the ": Foo" form
		v_desc = match[ 2 ] || match[ 3 ] || '';
	}

	// Add "Volume " if it looks like we need it
	vol = vol.replace( /^(?:(?:vol|v)\. ?)?([-0-9]+)$/i, 'Volume $1' );

	return [ vol, v_desc ];
}

/**
 * Unwrap the first City template in a string, leaving only its (trimmed)
 * first argument; any other text is returned untouched.
 *
 * @param {string} c the city field content
 * @return {string} the content with the template wrapper removed
 */
function processCity( c ) {
	var city_template = /\{\{ *City *\| *(.*?) *\}\}/i;

	return c.replace( city_template, '$1' );
}

/**
 * Return the value of the first listed template parameter that is present
 * and non-empty.
 *
 * @param {Object} params parsed template parameters
 * @param {string|string[]} keys one parameter name, or candidates in priority order
 * @return {string|undefined} the value; undefined or '' when nothing usable was found
 */
function first_param_value( params, keys ) {
	if ( typeof keys === 'string' ) {
		return params[ keys ];
	}

	var value = '';
	// find the first matching parameter
	for ( var i = 0; i < keys.length; i++ ) {
		value = params[ keys[ i ] ];
		if ( value !== undefined && value.length > 0 ) {
			break;
		}
	}
	return value;
}

/**
 * Fill the index form from an API response holding a file description page.
 *
 * For every page in the response with a non-negative ID, the "Book" template
 * is parsed out of the first revision's content and each parameter listed in
 * extract_dict is written to its form field through set_field(). The Source
 * field is set from the extension of the current page title, and a sort key
 * is set when the title starts with 'The', 'A', 'An' or 'Of'.
 *
 * @param {Object} data the API response (action=query, prop=revisions)
 * @return {boolean} true if a Book template was parsed and imported from at
 *   least one page
 */
function extract_content( data ) {
	var importationDone = false;

	// until Object entries is allowed
	// eslint-disable-next-line no-jquery/no-each-util
	$.each( data.query.pages, function ( ids, page ) {
		if ( ids < 0 ) {
			return;
		}

		var content = page.revisions[ 0 ][ '*' ],
			params = parse_template( content, 'Book' )[ 0 ],
			file_type = mw.config.get( 'wgTitle' ).split( '.' ).slice( -1 )[ 0 ];

		if ( params === null ) {
			console.error( 'Failed to parse Book template' );
			// The file type does not depend on the template. Everything else
			// does, so stop here instead of dereferencing null below, and
			// leave importationDone false so the caller can try elsewhere.
			set_field( 'Source', file_type );
			return;
		}

		var title = params[ extract_dict.Title ];

		if ( title ) {
			title = toTitleCase( title );
			// NOTE(review): the wiki markup around the title was stripped
			// from the copy under review; an italicised link is reconstructed
			// here - confirm against the deployed gadget.
			// eslint-disable-next-line no-useless-concat
			set_field( 'Title', '\'\'[' + '[' + title + ']]\'\'' );
		}

		for ( var idx in extract_dict ) {

			var template_content = first_param_value( params, extract_dict[ idx ] );

			if ( template_content === undefined || template_content.length === 0 ) {
				continue;
			}

			switch ( idx ) {
				case 'Editor':
				case 'Author':
				case 'Translator':
				case 'Illustrator':
					set_field( idx, process_authors( template_content ) );
					break;
				case 'Publisher':
					// it is very common for the Commons publisher field
					// to contain the location
					var pub_city = split_city_publisher( template_content );

					set_field( 'Publisher', pub_city[ 0 ] );
					if ( pub_city[ 1 ].length > 0 ) {
						set_field( 'City', pub_city[ 1 ] );
					}
					break;
				case 'Volume':
					var v = processVolume( template_content );
					var v_field;

					if ( title ) {
						// NOTE(review): the link target was stripped from the
						// copy under review (only the label survived); a
						// "Title/Volume" subpage is assumed - confirm.
						// eslint-disable-next-line no-useless-concat
						v_field = '[' + '[' + title + '/' + v[ 0 ] + '|' + v[ 0 ] + ']]';
					} else {
						// fallback
						v_field = v[ 0 ];
					}

					if ( v[ 1 ] ) {
						v_field += ' (' + v[ 1 ] + ')';
					}
					set_field( idx, v_field );
					break;
				case 'Title':
					// already handled above
					break;
				case 'City':
					set_field( idx, processCity( template_content ) );
					break;
				default:
					set_field( idx, template_content );
			}
		}

		// set the file type selector (after the loop, so it replaces any
		// Source value taken from the template)
		set_field( 'Source', file_type );

		// set the sort key
		var skTitle = params[ extract_dict.Title ];
		if ( skTitle !== undefined ) {
			var titlewords = skTitle.split( ' ' );
			if ( [ 'The', 'A', 'An', 'Of' ].indexOf( titlewords[ 0 ] ) >= 0 ) {
				// move the leading word to the end: "The Foo" -> "Foo, The"
				skTitle = titlewords.slice( 1 ).join( ' ' ) + ', ' + titlewords[ 0 ];
				skTitle = skTitle[ 0 ].toUpperCase() + skTitle.slice( 1 );
				set_field( 'Key', skTitle );
			}
		}

		importationDone = true;
	} );

	return importationDone;
}

/**
 * Handle the response for the file description page.
 *
 * The response is handed to extract_content(). If that reports that nothing
 * was imported, the same File: page is requested from this wiki's own API
 * and that response is handed to extract_content() instead.
 *
 * @param {Object} data the API response
 */
function common_content( data ) {
	if ( extract_content( data ) ) {
		return;
	}

	var local_request = {
		url: mw.util.wikiScript( 'api' ),
		data: {
			format: 'json',
			action: 'query',
			prop: 'revisions',
			rvprop: 'content',
			titles: 'File:' + mw.config.get( 'wgTitle' )
		}
	};

	$.ajax( local_request ).done( extract_content );
}

/**
 * Request the content of the File: page named after the current page title
 * from the Commons API (over JSONP) and pass the response to
 * common_content().
 */
function do_extraction() {
	$.ajax( {
		url: '//commons.wikimedia.org/w/api.php',
		data: {
			format: 'json',
			action: 'query',
			prop: 'revisions',
			rvprop: 'content',
			titles: 'File:' + mw.config.get( 'wgTitle' )
		},
		dataType: 'jsonp'
	} )
		.done( common_content );
}

/**
 * Initialise the gadget on an index edit page.
 *
 * Loads the localisation tables, then either runs the extraction straight
 * away (when the page shows the '.mw-newarticletext' notice) or adds a
 * 'Re-fill index' link to the 'p-tb' portlet that runs it on click.
 */
function setup() {
	setup_extract_dict();

	// eslint-disable-next-line no-jquery/no-global-selector
	if ( $( '.mw-newarticletext' ).length === 0 ) {

		// Portlet link to re-extract
		var portlet = mw.util.addPortletLink(
			'p-tb',
			'#',
			'Re-fill index',
			't-refill-index',
			'Re-import this index page\'s data from the Commons file'
		);

		$( portlet ).on( 'click', function ( e ) {
			// the link target is '#': do not follow it
			e.preventDefault();
			do_extraction();
		} );
	} else {
		do_extraction();
	}
}

/* Localisation section, you can provide your own data before loading this script to
 * change the script behavior. Only the entries that are missing from
 * self.fill_index_data are filled in with the defaults below. */
if ( !self.fill_index_data ) {
	self.fill_index_data = {};
}

// name of the namespace that author links point into
if ( !self.fill_index_data.ns_author_name ) {
	self.fill_index_data.ns_author_name = 'Author';
}

if ( !self.fill_index_data.extract_dict ) {
	// Commons Book template field names
	// Should not need to be internationalised
	// An array lists alternative parameter names in priority order.
	self.fill_index_data.extract_dict = {
		Editor: 'Editor',
		Publisher: 'Publisher',
		Author: 'Author',
		Translator: 'Translator',
		Volume: 'Volume',
		Illustrator: 'Illustrator',
		'Image page': 'Image page',
		Title: 'Title',
		Date: [ 'Publication date', 'Date' ],
		City: 'City',
		Source: 'Source',
		LCCN: 'LCCN',
		OCLC: 'OCLC'
	};
}

if ( !self.fill_index_data.field_names ) {
	// Proofread page field names
	self.fill_index_data.field_names = {
		Editor: 'Editor',
		Publisher: 'Publisher',
		Author: 'Author',
		Translator: 'Translator',
		Volume: 'Volume',
		Illustrator: 'Illustrator',
		'Image page': 'Image',
		Title: 'Title',
		Date: 'Year',
		City: 'Address',
		Source: 'Source',
		LCCN: 'LCCN',
		OCLC: 'OCLC',
		Key: 'Key'
	};
}
/* end of localisation section */

$( function {		if ( mw.config.get( 'wgCanonicalNamespace' ) === 'Index' && mw.config.get( 'wgAction' ) === 'edit' ) {			setup;		}	} ); // eslint-disable-next-line no-undef }( mediaWiki, jQuery, Promise ) );