User:BPLinkBot/Source

Dependencies

BPLinkBot requires the following Perl modules to be installed on the system. Some of them may already be present as core modules but will need to be upgraded to the latest versions. All of the packages can be obtained from CPAN; a quick check that they are installed is sketched after the list.

  • MediaWiki::API
    • JSON
    • Crypt::SSLeay
    • Digest::MD5
    • Test::Simple
  • MediaWiki::Bot
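
To confirm that the modules above are present before running the bot, a check along these lines can be used (a minimal sketch, not part of BPLinkBot itself; it simply tries to load each module and print its version):

#!/usr/bin/perl
# Sketch only - not part of BPLinkBot: check that the required modules are installed.
use strict;
use warnings;

my @modules = qw(MediaWiki::API JSON Crypt::SSLeay Digest::MD5 Test::Simple MediaWiki::Bot);

foreach my $module (@modules) {
    # Turn Foo::Bar into Foo/Bar.pm so it can be loaded with require
    (my $file = "$module.pm") =~ s{::}{/}g;
    if (eval { require $file; 1 }) {
        my $version = $module->VERSION || 'unknown version';
        print "$module ($version) is installed\n";
    } else {
        print "$module is MISSING - install or upgrade it from CPAN\n";
    }
}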

Source Code

This is current as of Version 1.1 (Jan 8, 2011).

#!/usr/bin/perl

use MediaWiki::API;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use MediaWiki::Bot;

my $mw = MediaWiki::API->new();
$mw->{config}->{api_url} = 'http://www.beachapedia.org/api.php';
$mw->login( { lgname => 'BPLinkBot', lgpassword => 'REDACTED' } )
    || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
binmode STDOUT, ':utf8';

# Get a list of page Titles from the Wiki:

$mw->list ({ action => 'query', list => 'allpages'}, 
	{ max => 5000, hook => \&article_list });
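# MediaWiki::API calls the hook once for each batch of results, passing a
# reference to an array of page records; collect each record's title.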
sub article_list {
    my ($ref) = @_;
    foreach (@$ref) {
        push @titles, $_->{title};
    }
}
print "Found " . scalar(@titles) . " pages on the wiki!\n";

%links = ();

# Get a list of external links for each page Title
# and build the database:
# keys are URLs, values are pipe-delimited lists of page titles
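# e.g. $links{'http://example.org/'} might end up as 'Page One|Page Two'
# (hypothetical URL and page titles, shown only to illustrate the structure)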

foreach $title (@titles) {
	my $pagelinks = $mw->api({ action => 'query', prop => 'extlinks', titles => $title },
		{ max => 5000 });
	my ($id, $extlinks) = each( %{ $pagelinks->{query}->{pages} } );
	foreach ( @{ $extlinks->{extlinks} || [] } ) { # skip pages with no external links
		my $url = $_->{'*'};
		if (exists $links{$url}) {
			$links{$url} .= '|' . $title;
		} else {
			$links{$url} = $title;
		}
	}
}
print scalar(keys %links), " links found to external sites.\n\n";

# Logout using API module

$mw->logout();

# Switch to Bot module
 
my $bot = MediaWiki::Bot->new({
	protocol => 'http',
	host     => 'www.beachapedia.org',
	path     => '',
});
$bot->login({
	username => 'BPLinkBot',
	password => 'REDACTED',
});
# Clear Badpages hash:

%badpages = ();
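# Keys of %badpages are page titles found to contain at least one broken link;
# they are tagged with a category after all URLs have been checked.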


# Check each URL:

foreach my $url (keys %links) {
	if ($url =~ m/^mailto:/) {
		next; # Skip email links
	}
	if ($url =~ m[^http://scc.ca.gov/]) {
		next; # scc.ca.gov links always test as bad
	}
	my @pagesusingURL = split /\|/, $links{"$url"};
	($rawurl, $bookmark) = split /\#/, $url; # We don't want bookmarks confusing the issue


	my $ua = LWP::UserAgent->new();
	$ua->agent("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13"); # pretend to be a Mac version of Firefox
	my $req = HTTP::Request->new(GET => $rawurl);
	$req->referer("http://www.beachapedia.org"); # Tell them where we were referred from
	my $response = $ua->request($req);
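	# is_error() is true for HTTP 4xx/5xx responses; when a URL fails, note it
	# on the Talk page of every article that links to it.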
	if ($response->is_error()) {
		foreach my $page (@pagesusingURL) {
			$badpages{$page} = 1;
			my $talkpagetext = $bot->get_text("Talk:$page") || ''; # guard against a Talk page that does not exist yet
			$talkpagetext .= "\n\nFound Bad Link: $url\n\n--~~~~";
			$bot->edit({
				page	=> "Talk:$page",
				text	=> $talkpagetext,
				summary => 'Adding Bad Link Info'
			});
		}
	}
}

# Add pages with bad links to Category:HasBadLinks
if (keys %badpages) {
	foreach my $page (keys %badpages) {
		my $pagetext = $bot->get_text($page);
		$pagetext .= "\n[[Category:HasBadLinks]]\n\n";
		$bot->edit({ page => $page, text => $pagetext, summary => 'Added HasBadLinks Category' });
	}
}


$bot->logout();