User:NewsieBot/WikiBot.class.php
Jump to navigation
Jump to search
<?php
/**
* Title: WikiBot.class.php
* Author(s): Brian McNeil ([[n:Brian McNeil]])
* Version: 0.0.3-0
* Date: October 13, 2012
* Description:
* Basic class for interaction with MediaWiki. Handles login/out,
* fetching pages, and writing pages.
*
* Copyright: CC-BY-2.5 (See Creative Commons website for full terms)
*
* History
* 0.0.3-0 2012-10-13 Brian McNeil
* Document now at most-basic functions.
**/
require_once(CLASSPATH.'HTTPcurl.class.php');
/**
* This is the base class for interacting with a MediaWiki install
* @author Brian McNeil
**/
class WikiBot {
// For safety, should user mangle data for wiki, the test
// wiki is set as a default and the path to the API is as-per
// the usual default for a 'vanilla' MediaWiki install.
// NOTE(review): the default base URL is plain http; login() POSTs the
// wiki password to this host. Consider https - confirm against the
// target wiki and the TLS configuration of HTTPcurl.
const DEFAULT_wiki = 'http://test.wikipedia.org';
// Path to api.php, appended to the base URL to form the API endpoint.
const DEFAULT_api = '/w/api.php';
// Query-string prefix for action=query requests (PHP-serialized output).
const API_qry = '?action=query&format=php';
// Query-string prefix for action=parse requests (PHP-serialized output).
const API_parse = '?action=parse&format=php';
// Associative array holding all bot state: the cURL object ('cURL'),
// the API endpoint ('URL'), edit token and timestamps, last error and
// error code, the per-write flags and the 'quiet' switch. Read and
// written from outside the class through __get() / __set().
private $bot;
/**
 * Constructor; prepares the bot's state for one wiki.
 *
 * Builds the cURL object and API URL via init_cURL(), optionally registers
 * HTTP-Auth details on it, then seeds the tracking, error and per-write
 * flag entries of the state array.
 *
 * @param $wiki_url The "base" URL for the wiki (eg http://test.wikipedia.org)
 * @param $wiki_api Optional path to the wiki's api.php. Defaults to standard for WMF installs
 * @param $ht_user An optional username if HTTP-Auth is in-use
 * @param $ht_pass An optional password for HTTP-Auth (not same as user/pass for wiki login)
 * @param $quiet Default true; pass false to make the class output tracing comments/messages
 * @return True/False depending on success. Note that PHP discards a
 * constructor's return value when the object is created with 'new';
 * if init_cURL() fails the state array is simply left unset.
 **/
function __construct($wiki_url=self::DEFAULT_wiki, $wiki_api=self::DEFAULT_api, $ht_user=null, $ht_pass=null, $quiet=true ) {
    // The cURL instance is for exclusive use by this bot
    $state = $this->init_cURL( $wiki_url, $wiki_api );
    if ( $state == false ) {
        return false;
    }

    // HTTP-Auth is only configured when a username was supplied
    if ( $ht_user !== null ) {
        $state['cURL']->HTTP_auth( $ht_user, $ht_pass );
    }

    // Array union appends the keys below after those from init_cURL()
    $this->bot = $state + array(
        // Revision ID/page/time are tracked so bots can detect edit conflicts
        'revid' => null,
        'pagetitle' => null,
        'rev_time' => null,
        // Last error message and its code
        'error' => null,
        'errcode' => null,
        // Default behaviours - these reset on every page write!
        'bot' => true,       // We're a bot
        'minor' => false,    // We don't make minor edits
        'conflict' => true,  // Respect edit conflicts
        'newpage' => false,  // Not writing new page unless told so
        'readtime' => 0,     // Set to the time a page was retrieved if not
                             // writing the most-recently-read page
        // Keep quiet unless told otherwise
        'quiet' => $quiet
    );
    return true;
}
/**
 * Generalised getter; exposes entries of the private state array as
 * if they were object properties (eg $bot->error, $bot->revid).
 *
 * 'cURL' is refused and recorded as a fatal error. 'user' maps to the
 * saved login name when credentials have been stored.
 * NOTE(review): 'credentials' itself is not blocked here, so the saved
 * password is readable through this getter - confirm that is intended.
 *
 * @param $var Variable being sought
 * @return The stored value; null when the entry is unset or null;
 * false when access is refused
 **/
public function __get( $var ) {
    // Explicitly block access to the cURL object
    if ( $var == 'cURL' ) {
        $this->bot['error'] = "Access to cURL details not permitted";
        $this->bot['errcode'] = 'fatal';
        return false;
    }

    // Special case - bot username, only once credentials exist;
    // otherwise fall through to the generic lookup below
    if ( $var == 'user' && isset($this->bot['credentials']) ) {
        return $this->bot['credentials']['lgname'];
    }

    // No trying anything 'cute'; only accept string
    if ( is_string($var) === false ) {
        $this->bot['error'] = "Invalid variable access attempt, must be string containing variable name";
        $this->bot['errcode'] = 'warning';
        return false;
    }

    // Hand back the entry if there is one; isset() also treats a
    // stored null as absent, which yields null either way
    return isset($this->bot[$var]) ? $this->bot[$var] : null;
}
/**
 * A limited-scope 'set' function; only entries that already exist in the
 * bot's state array may be changed, and 'cURL' / 'credentials' never.
 *
 * Errors are written straight into the state array. Assigning to
 * $this->error from inside this method would re-enter __set() and end up
 * creating stray public properties that shadow the state array entries.
 *
 * Existence is tested with array_key_exists() so that entries whose
 * current value is null (token, revid, error, ...) can still be set;
 * isset() reports those as missing.
 *
 * @param $var 'bot' variable to set
 * @param $value Value to assign to variable
 * @return True if successful, false if fails (PHP ignores this value
 * for a plain assignment; check $bot->error afterwards instead)
 **/
public function __set( $var, $value ) {
    if ( $this->quiet == false )
        echo "Request to set variable: $var to '$value'\r\n";

    if ( $var === 'cURL' || $var === 'credentials' ) {
        $this->bot['error'] = "Setting protected variable outside object creation not permitted.";
        $this->bot['errcode'] = 'fail';
        return false;
    }

    // is_array() guards the case where construction failed and no
    // state array was ever stored
    if ( is_array($this->bot) && array_key_exists($var, $this->bot) ) {
        $this->bot[$var] = $value;
        return true;
    }

    $this->bot['error'] = "Request to set undefined object variable for WikiBot class.";
    $this->bot['errcode'] = 'warning';
    return false;
}
/**
 * Configuration helper; creates the HTTPcurl instance and the first
 * entries of the bot's state array.
 *
 * NOTE(review): the HTTPcurl 'quiet' parameter is forced to false here,
 * independent of the bot's own quiet flag - looks like a debugging
 * leftover; confirm before changing.
 *
 * @param $wiki Base URL of the wiki
 * @param $api Path to api.php, appended to $wiki
 * @return false if fails, otherwise an array holding the cURL instance
 * ('cURL'), the wiki's API URL ('URL') and null 'token' / 'timestamp'
 **/
private function init_cURL( $wiki, $api ) {
    $handle = new HTTPcurl();
    if ( !$handle ) {
        return false;
    }
    $handle->param['quiet'] = false; // try to make noisy

    return array(
        'cURL' => $handle,
        'URL' => $wiki.$api,
        'token' => null,
        'timestamp' => null
    );
}
/**
 * Destructor; drops the bot's state array (and with it the reference
 * to the cURL instance held inside).
 * @return void
 **/
function __destruct() {
    // Remove the property outright rather than just emptying it
    unset( $this->bot );
}
/**
 * 'Raw' query function; sends a query to the target MediaWiki instance
 * with the assumption the relevant API string is in-place.
 *
 * The request goes out as GET unless $postdata is supplied, in which
 * case it is POSTed. The response is expected in format=php and is
 * unserialized before being returned.
 *
 * @param $query Passed-in query string (eg '&prop=revisions&title=Foo')
 * @param $postdata Optional data to go by POST method
 * @return False if fails (error/errcode are set), or unserialized result data
 **/
private function query( $query, $postdata = null ) {
    $verbose = ( $this->quiet == false );
    // Trace the query actually being sent
    if ( $verbose ) echo "Doing query: $query \r\n";

    $target = $this->URL.$query;
    // Loose comparison kept on purpose: an empty array also means GET
    if ( $postdata == null ) {
        if ( $verbose ) echo " Request type: GET\r\n";
        $raw = $this->bot['cURL']->http_get($target);
    } else {
        if ( $verbose ) echo " Request type: POST\r\n";
        $raw = $this->bot['cURL']->http_post($target, $postdata);
    }

    if ( !$raw ) {
        $this->bot['error'] = "Error with cURL library";
        $this->bot['errcode'] = 'fatal';
        return false;
    }

    // NOTE(review): unserialize() on data received over the network can
    // instantiate arbitrary classes if the remote end is not trusted.
    // Switching the API calls to format=json with json_decode() would
    // remove that risk, but touches every caller's query strings.
    $decoded = unserialize($raw);

    // unserialize() signals failure with false; 'b:0;' is the one payload
    // that legitimately decodes to false
    if ( $decoded === false && $raw !== 'b:0;' ) {
        $this->bot['error'] = "Unable to decode response from MediaWiki API";
        $this->bot['errcode'] = 'fatal';
        return false;
    }
    return $decoded;
}
/**
 * API query function; prefixes the query with the action=query API
 * string and hands it to query().
 * @param $query Passed-in query string (eg '&prop=revisions&title=Foo')
 * @param $postdata Optional data to go by POST method
 * @return False if fails, or unserialized result data
 **/
function query_api( $query, $postdata = null ) {
    if ( $this->quiet == false ) {
        echo "API query of: $query \r\n";
    }
    return $this->query( self::API_qry.$query, $postdata );
}
/**
 * Content query function; prefixes the query with the action=parse API
 * string and hands it to query().
 * @param $query Passed-in query string (eg '&prop=sections&page=Foo')
 * @param $postdata Optional data to go by POST method
 * @return False if fails, or unserialized result data
 **/
function query_content( $query, $postdata = null ) {
    if ( $this->quiet == false ) {
        echo "Content request of: $query \r\n";
    }
    return $this->query( self::API_parse.$query, $postdata );
}
/**
 * Wiki login function.
 *
 * When a username is given the credentials are stored in the bot's state
 * before being used; with no username, previously stored credentials are
 * reused. A 'NeedToken' reply triggers a second request carrying the
 * token that was handed back.
 *
 * @param $user Username to log in with
 * @param $pass Password for the user
 * @return False if fails (error/errcode set), or array of data from the API if succeeds
 **/
function login( $user = null, $pass = null ) {
    if ( $this->quiet == false ) echo "Logging in, user: $user \r\n";
    $endpoint = '?action=login&format=php';

    if ( $user === null ) {
        // Nothing passed in; fall back on saved credentials
        if ( $this->quiet == false ) echo " Trying to retrieve saved credentials\r\n";
        if ( !isset($this->bot['credentials']) ) {
            $this->bot['error'] = "Login failed; no credentials supplied";
            $this->bot['errcode'] = 'fatal';
            return false; // Fail, don't have any saved credentials
        }
        $fields = $this->bot['credentials'];
    } else {
        // Save the credentials we got before trying to use them
        $fields = array(
            'lgname' => $user,
            'lgpassword' => $pass
        );
        $this->bot['credentials'] = $fields;
    }

    // Start trying to log in...
    $reply = $this->query( $endpoint, $fields );
    if ( !isset($reply['login']['result']) ) {
        $this->bot['error'] = "Login failed; no result returned";
        $this->bot['errcode'] = 'fatal';
        return false; // It failed to give a result at-all
    }

    // Token required in more-recent MediaWiki versions
    if ( $reply['login']['result'] == 'NeedToken' ) {
        $fields['lgtoken'] = $reply['login']['token'];
        $reply = $this->query( $endpoint, $fields );
        if ( !isset($reply['login']['result']) ) {
            $this->bot['error'] = "Login failed; no result returned";
            $this->bot['errcode'] = 'fatal';
            return false; // Again, didn't get returned a result.
        }
    }

    if ( $reply['login']['result'] !== 'Success' ) {
        // The login failed, probably incorrect credentials
        $this->bot['error'] = "Login failed; returned:".$reply['login']['result'];
        $this->bot['errcode'] = 'fatal';
        return false;
    }
    return $reply;
}
/**
 * Logout function; sends action=logout and ignores the reply.
 * @return Falls out, thus returning null
 **/
function logout() {
    if ( $this->quiet == false ) {
        echo "Logging out\r\n";
    }
    $this->query( '?action=logout&format=php' );
}
/**
 * General 'page-fetching' function.
 *
 * Records the requested title plus the revision's timestamp and ID in
 * the bot's state so a later write can detect edit conflicts. The edit
 * token and start timestamp are only stored when $gettoken is set; a
 * plain fetch leaves a previously obtained token untouched.
 *
 * When the API returns a page entry without a revision (eg the page
 * does not exist) the bookkeeping above is still performed - so a token
 * requested for a not-yet-existing page is kept - but false is returned.
 *
 * @param $page The title of the required page
 * @param $gettoken If an edit token is required, also results in the
 * edit token and start timestamp being saved.
 * @param $revid The revision ID (optional) to be fetched
 * @return False if fails (error/errcode set), or wikitext of desired page
 **/
function get_page( $page, $gettoken = false, $revid = null ) {
    if ( $this->quiet == false ) echo "Fetching page: $page \r\n";

    // If asked for an edit token when fetching page, query differs
    if ( $gettoken ) {
        if ( $this->quiet == false ) echo " Asking edit token\r\n";
        $q = '&prop=revisions|info&intoken=edit';
    } else {
        $q = '&prop=revisions';
    }
    $q .= '&titles='.urlencode($page).'&rvlimit=1&rvprop=content|timestamp|ids';

    // If asking for specific version, select such
    if ( $revid !== null )
        $q .= '&rvstartid='.urlencode((string)$revid);

    $r = $this->query_api( $q );
    if ( !$r ) {
        $this->bot['error'] = "No data returned by MediaWiki API";
        $this->bot['errcode'] = 'fatal';
        return false;
    }

    // An API-level error reply carries no 'query' => 'pages' list
    if ( !isset($r['query']['pages']) || !is_array($r['query']['pages']) ) {
        $this->bot['error'] = isset($r['error']['info'])
            ? $r['error']['info']
            : "Unknown error fetching wiki page";
        $this->bot['errcode'] = 'warning';
        return false;
    }

    // Only the first page entry is ever used
    foreach ( $r['query']['pages'] as $t_page ) {
        $rev = isset($t_page['revisions'][0]) ? $t_page['revisions'][0] : array();

        // Now, stash page fetched and the revision ID.
        $this->bot['pagetitle'] = $page;
        $this->bot['rev_time'] = isset($rev['timestamp']) ? $rev['timestamp'] : null;
        $this->bot['revid'] = isset($rev['revid']) ? $rev['revid'] : null;

        // Save details of the edit token and the 'edit' start timestamp,
        // but only when they were actually requested
        if ( $gettoken ) {
            $this->bot['token'] = isset($t_page['edittoken']) ? $t_page['edittoken'] : null;
            $this->bot['timestamp'] = isset($t_page['starttimestamp']) ? $t_page['starttimestamp'] : null;
        }

        if ( !isset($rev['*']) ) {
            $this->bot['error'] = "Requested page has no revision content (page may not exist)";
            $this->bot['errcode'] = 'warning';
            return false;
        }

        // Return the wiki-markup page content
        return $rev['*'];
    }

    // If we hit here, we've not got a page back
    $this->bot['error'] = "Unknown error fetching wiki page";
    $this->bot['errcode'] = 'warning';
    return false;
}
/**
 * Page write function.
 * This function will handle most page write permutations; certain options can be
 * 'tweaked' by setting extra variables (eg: $bot->conflict = false to ignore edit conflicts)
 * Those variable options are reset to: mark edits as bot, not minor, respect edit conflicts,
 * not writing new pages. It is also assumed (unless specified in $bot->readtime) that
 * the page being written was the last read. If not, $bot->readtime must contain the
 * timestamp of the retrieved revision.
 *
 * The flags are read directly from the state array: inside the class
 * $this->bot is the whole array (always truthy), not the 'bot' flag.
 *
 * @param $title Title of page being accessed/written
 * @param $content Content of page, or section, to write
 * @param $summary Edit summary, or new section name
 * @param $section A numeric string for section number to edit or 'new' to append new
 * @return False if fails (error/errcode set), otherwise the data returned by the API.
 **/
function write_page( $title, $content, $summary = null, $section = null ) {
    if ( $this->quiet == false ) echo "Writing to page:$title \r\n";
    $q = '?action=edit&format=php';
    $post = array(
        'title' => $title,
        'summary' => $summary,
        ($this->bot['bot'] ? 'bot' : 'notbot') => true,
        ($this->bot['minor'] ? 'minor' : 'notminor') => true
    );

    // Pick the base timestamp: an explicit readtime wins, otherwise the
    // timestamp of the most recently fetched revision
    $readtime = $this->bot['readtime'];
    if ( $readtime !== 0 ) {
        $e_timestamp = $readtime;
    } else {
        $e_timestamp = $this->bot['rev_time'];
        // No readtime, must be editing last-page retrieved unless newpage set
        if ( !$this->bot['newpage'] && $this->bot['pagetitle'] !== $title ) {
            $this->bot['error'] = "Cannot update a page that not previously retrieved\r\n" ;
            $this->bot['errcode'] = 'warning';
            return false;
        }
    }

    if ( $this->bot['conflict'] ) {
        $post['basetimestamp'] = $e_timestamp; // Try catch edit conflicts
    } else {
        $post['recreate'] = true; // Or overwrite anything
    }

    // Handle writing new section, or updating a section. The value is
    // passed through as given ('new' or a section number); no loose
    // comparison against 'new', which on older PHP also matched integer 0.
    if ( $section !== null ) {
        $post['section'] = $section;
        $post['sectiontitle'] = $summary;
    }
    $post['text'] = $content;
    $post['token'] = $this->bot['token'];

    // Default behaviours - These reset on every page write!
    $this->bot['bot'] = true;      // We're a bot
    $this->bot['minor'] = false;   // We don't make minor edits
    $this->bot['conflict'] = true; // Respect edit conflicts
    $this->bot['newpage'] = false; // Not writing new page unless say so
    $this->bot['readtime'] = 0;    // Set this to time page retrieved if not
                                   // writing most-recently-read page.

    $result = $this->query( $q, $post );
    if ( isset($result['error']) ) {
        $this->bot['error'] = $result['error']['info'];
        $this->bot['errcode'] = $result['error']['code'];
        return false;
    }
    return $result;
}
/**
 * Function to retrieve a page's table of contents (index).
 *
 * The result always starts with a synthetic entry for section 0 (the
 * lead), followed by one entry per section reported by action=parse.
 * A specific revision is selected with the parse module's 'oldid'
 * parameter. When no sections come back, get_page() is used to check
 * that the page exists at all; note that this updates the bot's
 * last-fetched page bookkeeping.
 *
 * @param $page The title of the required page
 * @param $revid Optional, the revision ID to fetch the TOC of
 * @return False if fails (error/errcode set), or the TOC in an array.
 **/
function get_toc( $page, $revid = null ) {
    // Section 0 is never listed by the API, so seed it by hand
    $toc = array(
        array(
            'index' => '0',
            'heading' => '',
            'level' => '0',
            'page' => $page,
            'number' => '0'
        )
    );
    if ( $this->quiet == false ) echo "Fetching TOC for: $page \r\n";

    $q = '&prop=sections&page='.urlencode($page);
    if ( $revid !== null )
        $q .= '&oldid='.urlencode((string)$revid);

    $r = $this->query_content( $q );
    if ( !is_array($r) ) { // Request failed or reply could not be decoded
        $this->bot['error'] = "No data returned by MediaWiki API";
        $this->bot['errcode'] = 'fatal';
        return false;
    }
    if ( isset($r['error']) ) { // Error getting any page data
        $this->bot['error'] = isset($r['error']['info'])
            ? $r['error']['info']
            : "Unknown error returned by MediaWiki API";
        $this->bot['errcode'] = 'error';
        return false;
    }

    $toc_elem = ( isset($r['parse']['sections']) && is_array($r['parse']['sections']) )
        ? $r['parse']['sections']
        : array();

    if ( empty($toc_elem) ) { // Empty, does that mean page doesn't exist?
        // Only false/null mean "no page"; an existing but empty page
        // yields '' and must not be reported as nonexistent
        $content = $this->get_page( $page );
        if ( $content === false || $content === null ) {
            $this->bot['error'] = "Requested TOC for nonexistent page";
            $this->bot['errcode'] = 'warning';
            return false;
        }
    }

    foreach ( $toc_elem as $line ) {
        $toc[] = array(
            'index' => $line['index'],
            'heading' => $line['line'],
            'level' => $line['level'],
            'page' => $line['fromtitle'],
            'number' => $line['number']
        );
    }
    return $toc;
}
}
?>