User:Iridibot/Plugins/wizardBot

From Wikinews, the free news source you can write!
Jump to navigation Jump to search

wizardBot[edit]

This plugin for Iridibot is designed to check the spelling of recently modified articles and to try to identify mistakes. The bot categorizes every word it has seen and filters each one into a white list or a black list. Keen. irid t i e 03:35, 25 September 2007 (UTC)

Source to wizardBot.php[edit]

<?php

// Base URL of the wiki; newCurlObject() appends the script name (default 'api.php') to it.
define('APIBASE', 'http://en.wikinews.org/w/');
// Bot identifier: appears in the User-Agent string and in the cookie file name under /tmp.
define('BOTNAME', 'wizardbot');
// Credentials that login() sends as lgname / lgpassword. Blank here; fill in before running.
define('BOTLOGIN', '');
define('BOTPASSWORD', '');
// Version string reported in the User-Agent header.
define('BOTVERSION', '1.76alpha');
// Fragment that curl_makePOST() places at the front of every POST body.
// The replies are decoded with unserialize(), so the format must stay PHP-serialized.
define('DEFAULTAPIPARAMS', 'format=php');

define('WZB_MINWORDLEN', 4); // minimum word length to be considered

// The shared file defines every helper used below and opens the MySQL
// connection, so the script cannot do anything useful without it: require it
// (fatal when missing) rather than include it (warning only).
require_once('wizardBot.fn.php');

if (login() === false)
	die("Login failed. Giving up.\n\n");

// fetchRC() returns false when no cURL handle could be created, and
// unserialize() yields false on a garbled reply; stop instead of iterating
// over a non-array.
$rc = fetchRC();
if (!is_array($rc) || !isset($rc['query']['recentchanges']) || !is_array($rc['query']['recentchanges']))
	die("Could not fetch recent changes. Giving up.\n\n");

// Page ids already scanned during this run. A page can appear several times
// in the recent changes list; only its first entry needs to be processed.
$batch = array();
foreach($rc['query']['recentchanges'] as $rev)
{
	$pageid = (int)$rev['pageid'];
	$revid = (int)$rev['revid'];

	if (isset($batch[$pageid]))
		continue;

	$q = mysql_query('SELECT rev FROM articles WHERE aid='.$pageid.' LIMIT 1');
	if ($q === false)
		die('query failed: '.mysql_error());

	// Decide from scratch for every entry so the flag is never read before
	// it has been assigned and never leaks over from a previous iteration.
	$scan = false;
	if (mysql_num_rows($q) === 0)
	{
		// First time this article is seen: create its row, then scan it.
		if (mysql_query('INSERT INTO articles SET aid='.$pageid.', rev='.$revid) === false)
			die('query failed: '.mysql_error());
		$scan = true;
	}
	else
	{
		// Known article: rescan only when this change is newer than the
		// revision recorded at the last scan.
		$r = mysql_fetch_row($q);
		$scan = ((int)$r[0] < $revid);
	}
	mysql_free_result($q);

	if ($scan)
	{
		$batch[$pageid] = true;
		$results = checkArticle($rev['title'], $rev['pageid'], $rev['revid']);
		print($rev['title'].' ('.$rev['pageid'].'): bl='.$results['blo'].' uc='.$results['nwo'].' ad='.$results['awo']."\n");
	}
}

?>

Source to wizardBot.fn.php (shared)[edit]

<?php

/**
 * Build a cURL handle for a POST request against the wiki.
 *
 * The handle targets APIBASE followed by $uri, returns the response body as a
 * string without headers, shares one cookie file per bot name under /tmp for
 * both reading and writing, and identifies itself with the bot name, version
 * and login in the User-Agent.
 *
 * @param string $uri Script name appended to APIBASE.
 * @return resource|false The configured handle, or false if curl_init() failed.
 */
function newCurlObject($uri='api.php')
{
	$handle = curl_init(APIBASE.$uri);
	if ($handle === false)
		return false;

	$cookieFile = '/tmp/'.BOTNAME.'.cookies';
	$userAgent = BOTNAME.' v'.BOTVERSION.' (as '.BOTLOGIN.')';

	// Applied one by one, in this order.
	$options = array(
		CURLOPT_HEADER         => false,
		CURLOPT_POST           => true,
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_COOKIEFILE     => $cookieFile,
		CURLOPT_COOKIEJAR      => $cookieFile,
		CURLOPT_USERAGENT      => $userAgent,
	);
	foreach ($options as $option => $value)
		curl_setopt($handle, $option, $value);

	return $handle;
}

/**
 * Set the POST body of a cURL handle from an associative array.
 *
 * The body starts with DEFAULTAPIPARAMS (when that constant is non-empty),
 * followed by one key=value pair per array entry, all joined with '&'.
 * Values are URL-encoded; keys are used verbatim.
 *
 * @param resource $c cURL handle to configure.
 * @param array    $p Parameter name => value. Anything that is not an array
 *                    leaves the handle untouched.
 * @return void
 */
function curl_makePOST($c, $p)
{
	if (!is_array($p))
		return;

	$pairs = array();
	if (strlen(DEFAULTAPIPARAMS) > 0)
		$pairs[] = DEFAULTAPIPARAMS;

	foreach ($p as $name => $value)
		$pairs[] = $name.'='.urlencode($value);

	curl_setopt($c, CURLOPT_POSTFIELDS, implode('&', $pairs));
}

/**
 * Log the bot in through the API using BOTLOGIN / BOTPASSWORD.
 *
 * The session cookie ends up in the cookie file configured by newCurlObject(),
 * so later requests made through that helper reuse it.
 *
 * @return bool True only when the API reply contains login.result equal to the
 *              string 'Success'; false on any transport or decoding failure.
 */
function login()
{
	$c = newCurlObject();
	if ($c === false) return false;
	// Start a fresh cookie session: ignore session cookies left by an earlier run.
	curl_setopt($c, CURLOPT_COOKIESESSION, true);
	curl_makePOST($c, array('action'=>'login','lgname'=>BOTLOGIN,'lgpassword'=>BOTPASSWORD));
	sleep(5); // presumably throttling to be polite to the server; kept as in the original
	$result = curl_exec($c);
	curl_close($c);
	unset($c);

	// curl_exec() returns false when the transfer failed.
	if (!is_string($result))
		return false;

	// NOTE(review): unserialize() on data received from the network (and over
	// plain http) can instantiate arbitrary objects. Consider requesting
	// format=json and using json_decode() instead; not changed here because
	// every other API helper relies on DEFAULTAPIPARAMS being format=php.
	$r = unserialize($result);
	if (!is_array($r) || !isset($r['login']) || !is_array($r['login']) || !isset($r['login']['result']))
		return false;

	// Strict comparison: with ==, an integer 0 or boolean true in the reply
	// would compare equal to 'Success' on PHP < 8.
	return $r['login']['result'] === 'Success';
}

/**
 * Ask the API for the recent changes list of one namespace.
 *
 * @param int $ns Namespace number passed as rcnamespace (default 0).
 * @return mixed False when no cURL handle could be created; otherwise whatever
 *               unserialize() produces from the raw API reply.
 */
function fetchRC($ns = 0)
{
	$curl = newCurlObject();
	if ($curl === false)
		return false;

	$params = array(
		'action'      => 'query',
		'list'        => 'recentchanges',
		'rcnamespace' => $ns,
	);
	curl_makePOST($curl, $params);

	// Pause before every request, as the original did.
	sleep(3);
	$raw = curl_exec($curl);
	curl_close($curl);

	return unserialize($raw);
}

/**
 * Fetch the wikitext of the current revision of one article.
 *
 * Terminates the script when the API reply cannot be decoded into an array
 * containing query.pages.
 *
 * @param string $article Page title, passed as the titles parameter.
 * @return string|false The revision text; false when no cURL handle could be
 *                      created or when the returned page entry carries no
 *                      revision content (e.g. the page no longer exists).
 */
function fetchArticleContent($article)
{
	$c = newCurlObject();
	if ($c === false) return false;
	curl_makePOST($c, array('action'=>'query','prop'=>'revisions','titles'=>$article,'rvprop'=>'content'));
	sleep(12); // presumably throttling between article fetches; kept as in the original
	$result = curl_exec($c);
	curl_close($c);
	unset($c);

	// NOTE(review): unserialize() on network data can instantiate arbitrary
	// objects; a JSON response format would be safer.
	$array = unserialize($result);
	// Check the shape with isset() first so that a failed transfer or a
	// garbled reply ends here cleanly instead of raising notices on the way.
	if (!is_array($array) || !isset($array['query']['pages']) || !is_array($array['query']['pages']))
		die('oh crap. get john stossel over here, we need a shovel.');

	// Only one title was requested, so the first page entry is the one wanted.
	$page = array_shift($array['query']['pages']);
	if (!is_array($page) || !isset($page['revisions'][0]['*']))
		return false;

	return $page['revisions'][0]['*'];
}

/**
 * Reduce wikitext to a frequency table of its words.
 *
 * Wikilinks, single-line templates, HTML comments, bold/italic quote markup
 * and URLs are removed first. The remainder is split on everything that is not
 * an ASCII letter or an apostrophe, so apostrophes inside a word ("world's")
 * are kept. Words are lower-cased; words shorter than WZB_MINWORDLEN are
 * dropped.
 *
 * @param string $text Raw wikitext.
 * @return array Lower-cased word => number of occurrences, sorted by word.
 */
function getWordList($text)
{
	$text = preg_replace('/\[\[.+?\]\]/','',$text);				// wikilinks
	$text = preg_replace('/{{.+?}}/','',$text);					// templates/parameters
	$text = preg_replace('/<!--.+?-->/s','',$text);				// comments (may span several lines)
	$text = str_replace("'''",'',$text);						// rm bold text markup
	$text = str_replace("''",'',$text);							// rm italic text markup
	// URLs: match up to the next whitespace OR the end of the text, so a URL
	// that is the very last token is removed too; also covers https.
	$text = preg_replace('/https?:\/\/\S+/','',$text);

	$words = preg_split('/[^a-zA-Z\']/', $text, -1, PREG_SPLIT_NO_EMPTY);
	unset($text);
	$wl = array();
	foreach($words as $v)
	{
		// Strip quoting apostrophes BEFORE measuring the length; otherwise a
		// quoted short word such as 'ab' would pass the filter as 4 characters.
		$v = trim($v, "'");
		if (strlen($v) < WZB_MINWORDLEN)
			continue; // ignore short words

		$key = strtolower($v);
		if (!isset($wl[$key]))
			$wl[$key] = 1;
		else
			$wl[$key]++;
	}
	ksort($wl);
	return($wl);
}

/**
 * Spell-check one article against the word lists and store the tallies.
 *
 * Every distinct word of the article is looked up in the words table. Unknown
 * words are inserted there (without a list assignment) and counted as added.
 * Known words are counted by the list they are on. The totals are then
 * written to the article's row in the articles table.
 *
 * @param string    $title  Article title; used to fetch the text and stored as aname.
 * @param int       $id     Page id identifying the row in articles.
 * @param int|false $revset Revision id to record as scanned, or false to leave
 *                          the stored revision untouched.
 * @return array Counters: 'blo' blacklisted, 'wlo' whitelisted, 'nwo' known
 *               but unclassified, 'awo' newly added, 'tot' distinct words seen.
 */
function checkArticle($title, $id, $revset = false)
{
	$blo = 0;
	$wlo = 0;
	$nwo = 0;
	$awo = 0;
	$tot = 0;
	foreach(getWordList(fetchArticleContent($title)) as $k=>$v)
	{
		$tot ++;
		$word = mysql_real_escape_string($k);
		$q = mysql_query('SELECT list FROM words WHERE word="'.$word.'"');
		if ($q === false)
			continue; // lookup failed: the word is counted in tot only

		if (mysql_num_rows($q) === 0)
		{
			$awo ++;
			// Best effort: a failed insert is ignored, the word simply shows
			// up as new again on the next scan.
			mysql_query('INSERT INTO words SET word="'.$word.'"');
		}
		else
		{
			$r = mysql_fetch_row($q);
			if ($r[0] == 'black')
				$blo ++;
			else if ($r[0] == 'white')
				$wlo ++;
			else
				$nwo ++;
		}
		// Always release the lookup result, whichever branch ran.
		mysql_free_result($q);
	}

	if ($revset !== false)
		$extra = ',rev='.(int)$revset;
	else
		$extra = '';

	// $id is cast to int so that it can never inject SQL into the WHERE clause.
	mysql_query('UPDATE articles SET blistw='.$blo.', wlistw='.$wlo.', ulistw='.($nwo+$awo).$extra.',aname="'.mysql_real_escape_string($title).'" WHERE aid='.(int)$id);
	return(array('blo'=>$blo,'wlo'=>$wlo,'nwo'=>$nwo,'awo'=>$awo,'tot'=>$tot));
}

// Open the database connection used by every mysql_query() call in the bot.
// Stop immediately when the server or the database is unavailable instead of
// letting each later query fail on its own.
// NOTE(review): the mysql_* extension was removed in PHP 7; moving to mysqli
// or PDO would have to be done for all queries at once.
if (mysql_connect('localhost','currentBot','currentBot') === false)
	die('mysql connect failed: '.mysql_error()."\n");
if (mysql_select_db('wizardBot') === false)
	die('mysql select_db failed: '.mysql_error()."\n");

?>

Screenshots[edit]

The word maintenance script