User:Iridibot/Plugins/wizardBot
< User:Iridibot | Plugins
wizardBot[edit]
This plugin for Iridibot checks the spelling of recently modified articles and tries to identify mistakes. The bot records every word it has seen and sorts each one into a whitelist or a blacklist. Keen. irid t i e 03:35, 25 September 2007 (UTC)
Source to wizardBot.php[edit]
<?php
// Base URL that newCurlObject() prefixes to the requested script name.
// NOTE(review): this is plain http, so the login POST presumably travels unencrypted - confirm whether https is available.
define('APIBASE', 'http://en.wikinews.org/w/');
// Bot name; used for the cookie file name in /tmp and as the start of the User-Agent string.
define('BOTNAME', 'wizardbot');
// Account name and password sent by login(); left blank here and must be filled in before running.
define('BOTLOGIN', '');
define('BOTPASSWORD', '');
// Version string reported in the User-Agent header.
define('BOTVERSION', '1.76alpha');
// Query-string fragment that curl_makePOST() puts at the start of every POST body.
define('DEFAULTAPIPARAMS', 'format=php');
define('WZB_MINWORDLEN', 4); // minimum word length (in bytes) for getWordList() to count a word
// Pulls in the helper functions (login, fetchRC, checkArticle, ...) used below.
include_once('wizardBot.fn.php');
// Authenticate first; every later API call relies on the session cookies.
if (login() === false)
    die("Login failed. Giving up.\n\n");

// Page ids that have already been scanned during this run.
$batch = array();

$rc = fetchRC();
// fetchRC() returns false when cURL cannot be initialised, and whatever
// unserialize() yields otherwise, so validate the shape before iterating.
if (!is_array($rc) || !isset($rc['query']['recentchanges']) || !is_array($rc['query']['recentchanges']))
    die("Could not fetch recent changes. Giving up.\n\n");

foreach ($rc['query']['recentchanges'] as $rev)
{
    // A page may appear several times in the feed; scan it only once per run.
    if (isset($batch[$rev['pageid']]))
        continue;

    // Reset the decision for every entry so it is never read uninitialised.
    $scan = false;

    $q = mysql_query('SELECT rev FROM articles WHERE aid='.(int)$rev['pageid'].' LIMIT 1');
    if ($q === false)
        die('query failed: '.mysql_error());

    if (mysql_num_rows($q) === 0)
    {
        // First time this page is seen: register it, then scan it.
        // The insert is best effort; its result is deliberately not checked.
        mysql_query('INSERT INTO articles SET aid='.(int)$rev['pageid'].', rev='.(int)$rev['revid']);
        $scan = true;
    }
    else
    {
        // Known page: rescan only if this revision is newer than the stored one.
        $r = mysql_fetch_row($q);
        if ((int)$r[0] < (int)$rev['revid'])
            $scan = true;
    }
    // The row has been read, so the result set can be released before the slow scan.
    mysql_free_result($q);

    if ($scan)
    {
        // Record the page so further feed entries for it are skipped
        // (presumably those are older revisions - TODO confirm feed ordering).
        $batch[$rev['pageid']] = true;
        $results = checkArticle($rev['title'], $rev['pageid'], $rev['revid']);
        print($rev['title'].' ('.$rev['pageid'].'): bl='.$results['blo'].' uc='.$results['nwo'].' ad='.$results['awo']."\n");
    }
}
?>
Source to wizardBot.fn.php[edit]
<?php
/**
 * Create a cURL handle aimed at APIBASE.$uri and preconfigured for the bot.
 *
 * The handle is set up for POST requests, returns the response body as a
 * string (without headers), reads and writes cookies in /tmp/<BOTNAME>.cookies
 * and identifies itself with a User-Agent built from BOTNAME, BOTVERSION and
 * BOTLOGIN.
 *
 * @param string $uri script name appended to APIBASE (defaults to 'api.php')
 * @return mixed the configured cURL handle, or false if curl_init() fails
 */
function newCurlObject($uri='api.php')
{
    $handle = curl_init(APIBASE.$uri);
    if ($handle === false)
        return false;

    // The same file serves as cookie source and cookie destination.
    $cookiePath = '/tmp/'.BOTNAME.'.cookies';
    $settings = array(
        CURLOPT_HEADER         => false,
        CURLOPT_POST           => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_COOKIEFILE     => $cookiePath,
        CURLOPT_COOKIEJAR      => $cookiePath,
        CURLOPT_USERAGENT      => BOTNAME.' v'.BOTVERSION.' (as '.BOTLOGIN.')',
    );
    // Apply the options one at a time, in declaration order.
    foreach ($settings as $option => $value)
        curl_setopt($handle, $option, $value);

    return $handle;
}
/**
 * Build the POST body for an API request and attach it to a cURL handle.
 *
 * The body starts with DEFAULTAPIPARAMS (when non-empty) followed by each
 * entry of $p as name=value, joined with '&'. Both names and values are
 * URL-encoded so that reserved characters in either cannot corrupt the body.
 *
 * @param mixed $c cURL handle to configure
 * @param array $p request parameters as name => value (values are assumed to be scalar)
 * @return void   nothing is set on the handle when $p is not an array
 */
function curl_makePOST($c, $p)
{
    if (!is_array($p))
        return;

    $pairs = array();
    if (strlen(DEFAULTAPIPARAMS) > 0)
        $pairs[] = DEFAULTAPIPARAMS;
    foreach ($p as $k => $v)
        $pairs[] = urlencode($k).'='.urlencode($v);

    curl_setopt($c, CURLOPT_POSTFIELDS, implode('&', $pairs));
}
/**
 * Log the bot in through the API using BOTLOGIN / BOTPASSWORD.
 *
 * A fresh cookie session is requested so stale cookies are not reused.
 * The call sleeps five seconds before sending the request.
 *
 * @return bool true only when the response reports login result 'Success';
 *              false on any cURL failure or unexpected response
 */
function login()
{
    $c = newCurlObject();
    if ($c === false)
        return false;

    curl_setopt($c, CURLOPT_COOKIESESSION, true);
    curl_makePOST($c, array('action'=>'login','lgname'=>BOTLOGIN,'lgpassword'=>BOTPASSWORD));
    sleep(5);
    $result = curl_exec($c);
    curl_close($c);

    // curl_exec() returns false on a transport error; there is nothing to parse then.
    if (!is_string($result))
        return false;

    // NOTE(review): unserialize() is being run on data received over the
    // network. It matches the 'format=php' response format requested, but it
    // is unsafe if the endpoint or the connection cannot be trusted.
    $r = unserialize($result);

    // isset() covers a non-array response as well as a missing 'result' key.
    return isset($r['login']['result']) && $r['login']['result'] === 'Success';
}
/**
 * Ask the API for the recent-changes list of one namespace.
 *
 * Sleeps three seconds before sending the request.
 *
 * @param int $ns value sent as the rcnamespace parameter (defaults to 0)
 * @return mixed the unserialized API response, or false when the cURL handle
 *               cannot be created or the response cannot be unserialized
 */
function fetchRC($ns = 0)
{
    $handle = newCurlObject();
    if ($handle === false)
        return false;

    $request = array(
        'action'      => 'query',
        'list'        => 'recentchanges',
        'rcnamespace' => $ns,
    );
    curl_makePOST($handle, $request);

    sleep(3);
    $raw = curl_exec($handle);
    curl_close($handle);

    // The response is decoded as PHP-serialized data, as requested via 'format=php'.
    $decoded = unserialize($raw);
    return $decoded;
}
/**
 * Fetch the wikitext of an article's revision through the API.
 *
 * Sleeps twelve seconds before sending the request. The script is terminated
 * with die() when the response does not contain a 'pages' array.
 *
 * @param string $article page title sent as the 'titles' parameter
 * @return mixed the revision text of the first page in the response; null
 *               when that page carries no revision content; false when the
 *               cURL handle cannot be created
 */
function fetchArticleContent($article)
{
    $c = newCurlObject();
    if ($c === false)
        return false;

    curl_makePOST($c, array('action'=>'query','prop'=>'revisions','titles'=>$article,'rvprop'=>'content'));
    sleep(12);
    $result = curl_exec($c);
    curl_close($c);

    // A transport failure yields false; treat it like any other unusable response.
    // NOTE(review): unserialize() on network data is unsafe if the endpoint is not trusted.
    $array = is_string($result) ? unserialize($result) : false;
    if (!is_array($array) || !isset($array['query']['pages']) || !is_array($array['query']['pages']))
        die('oh crap. get john stossel over here, we need a shovel.');

    // Only the first page of the response is used.
    $page = array_shift($array['query']['pages']);

    // The first page may have no 'revisions' entry; return null instead of
    // raising undefined-index notices.
    if (!isset($page['revisions']['0']['*']))
        return null;

    return $page['revisions']['0']['*'];
}
/**
 * Reduce wikitext to a frequency table of the words it contains.
 *
 * Wikilinks, templates, HTML comments, bold/italic markup and URLs are
 * removed first. The remaining text is split on anything that is not an
 * ASCII letter or an apostrophe. Each word is stripped of leading/trailing
 * apostrophes and lower-cased, then dropped if it is shorter than
 * WZB_MINWORDLEN.
 *
 * @param string $text raw wikitext (null/false are treated as empty text)
 * @return array lower-cased word => number of occurrences, sorted by word
 */
function getWordList($text)
{
    $text = (string)$text;
    $text = preg_replace('/\[\[.+?\]\]/', '', $text);     // wikilinks
    // The 's' modifier lets '.' cross newlines, so multi-line templates and
    // comments are removed as a whole instead of leaking their contents.
    $text = preg_replace('/\{\{.+?\}\}/s', '', $text);    // templates/parameters
    $text = preg_replace('/<!--.+?-->/s', '', $text);     // comments
    $text = str_replace("'''", '', $text);                // rm bold text markup
    $text = str_replace("''", '', $text);                 // rm italic text markup
    // Match http and https, and do not require trailing whitespace so a URL
    // at the very end of the text is removed too.
    $text = preg_replace('/https?:\/\/\S+/', '', $text);  // URLs

    $words = preg_split('/[^a-zA-Z\']/', $text, -1, PREG_SPLIT_NO_EMPTY);
    unset($text);

    $wl = array();
    foreach ($words as $v)
    {
        // Strip quoting apostrophes BEFORE measuring, otherwise a quoted
        // short word such as 'the' would pass the length filter.
        $v = strtolower(trim($v, "'"));
        if (strlen($v) < WZB_MINWORDLEN)
            continue; // ignore short words

        if (isset($wl[$v]))
            $wl[$v]++;
        else
            $wl[$v] = 1;
    }
    ksort($wl);
    return $wl;
}
/**
 * Fetch an article, classify each of its distinct words against the `words`
 * table and store the resulting counts in the `articles` table.
 *
 * Words not yet in the table are inserted with only the `word` column set.
 * Words whose lookup query fails are counted in 'tot' but in no other tally.
 *
 * @param string   $title  article title to fetch; also stored as aname
 * @param int      $id     page id identifying the row in `articles`
 * @param int|bool $revset revision id to store as rev, or false to leave rev untouched
 * @return array counts keyed 'blo' (blacklisted), 'wlo' (whitelisted),
 *               'nwo' (known but neither black nor white), 'awo' (newly added)
 *               and 'tot' (distinct words examined)
 */
function checkArticle($title, $id, $revset = false)
{
    $blo = 0;
    $wlo = 0;
    $nwo = 0;
    $awo = 0;
    $tot = 0;

    foreach (getWordList(fetchArticleContent($title)) as $k => $v)
    {
        $tot++;
        $word = mysql_real_escape_string($k);

        $q = mysql_query('SELECT list FROM words WHERE word="'.$word.'"');
        if ($q === false)
            continue; // lookup failed: nothing to classify and nothing to free

        if (mysql_num_rows($q) === 0)
        {
            // Unknown word: remember it. The insert is best effort; a failure
            // is ignored, and the lookup result is still released below.
            $awo++;
            mysql_query('INSERT INTO words SET word="'.$word.'"');
        }
        else
        {
            $r = mysql_fetch_row($q);
            if ($r[0] == 'black')
                $blo++;
            else if ($r[0] == 'white')
                $wlo++;
            else
                $nwo++;
        }

        // Always release the lookup result, whichever branch ran.
        mysql_free_result($q);
    }

    $extra = ($revset !== false) ? ',rev='.(int)$revset : '';

    // $id is cast like every other id in this file so it can never inject SQL.
    mysql_query('UPDATE articles SET blistw='.$blo.', wlistw='.$wlo.', ulistw='.($nwo+$awo).$extra.',aname="'.mysql_real_escape_string($title).'" WHERE aid='.(int)$id);

    return array('blo'=>$blo,'wlo'=>$wlo,'nwo'=>$nwo,'awo'=>$awo,'tot'=>$tot);
}
// Open the database connection as soon as this file is included; every
// mysql_query() call in the bot uses this implicit default link.
// Fail immediately with a clear message instead of letting the first query
// fail later.
if (mysql_connect('localhost','currentBot','currentBot') === false)
    die('database connection failed: '.mysql_error());
if (mysql_select_db('wizardBot') === false)
    die('database selection failed: '.mysql_error());
?>